DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. In addition, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
This document is currently split between _v003 and v_003_a due to the need to keep the number of DLLs that it opens below R's hard-coded maximum. This introductory section needs to be rewritten, and the contents consolidated, at a future date.
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents were split to produce this document:
Chapter 1 - Dates and Times in R
Introduction to dates - including the built-in methods for R:
Why use dates?
What about times?
Why lubridate?
Example code includes:
# ---- Chapter 1 example session: base R Date/POSIXct handling ----
# NOTE(review): lines beginning "## " are captured console output; they depend
# on the input CSVs and the session date (e.g. Sys.Date()), so keep code and
# output in sync if either changes.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# The date R 3.0.0 was released
x <- "2013-04-03"
# Examine structure of x
str(x)
## chr "2013-04-03"
# Use as.Date() to interpret x as a date
x_date <- as.Date(x)
# Examine structure of x_date
str(x_date)
## Date[1:1], format: "2013-04-03"
# Store April 10 2014 as a Date
april_10_2014 <- as.Date("2014-04-10")
# Load the readr package
library(readr)
# Use read_csv() to import rversions.csv (columns holding ISO-8601 text are
# parsed directly to Date/datetime/time types)
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine the structure of the date column
str(releases$date)
## Date[1:105], format: "1997-12-04" "1997-12-21" "1998-01-10" "1998-03-14" ...
# Load the anytime package
library(anytime)
# Various ways of writing Sep 10 2009
sep_10_2009 <- c("September 10 2009", "2009-09-10", "10 Sep 2009", "09-10-2009")
# Use anytime() to parse sep_10_2009 (anytime() guesses the format of each
# element, so no format specification is needed)
anytime(sep_10_2009)
## [1] "2009-09-10 CDT" "2009-09-10 CDT" "2009-09-10 CDT" "2009-09-10 CDT"
# Set the x axis to the date column
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major)))
# Limit the axis to between 2010-01-01 and 2014-01-01
# (xlim() drops out-of-range rows entirely, which triggers the warning below)
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
xlim(as.Date("2010-01-01"), as.Date("2014-01-01"))
## Warning: Removed 87 rows containing missing values (geom_path).
# Specify breaks every ten years and labels with "%Y"
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
scale_x_date(date_breaks = "10 years", date_labels = "%Y")
# Find the largest date
last_release_date <- max(releases$date)
# Filter row for last release
last_release <- filter(releases, date == last_release_date)
# Print last_release
last_release
## # A tibble: 1 x 7
## major minor patch date datetime time type
## <int> <int> <int> <date> <dttm> <time> <chr>
## 1 3 4 1 2017-06-30 2017-06-30 07:04:11 07:04 patch
# How long since last release? (Date subtraction returns a difftime)
Sys.Date() - last_release_date
## Time difference of 275 days
# Use as.POSIXct to enter the datetime
as.POSIXct("2010-10-01 12:12:00")
## [1] "2010-10-01 12:12:00 CDT"
# Use as.POSIXct again but set the timezone to `"America/Los_Angeles"`
as.POSIXct("2010-10-01 12:12:00", tz = "America/Los_Angeles")
## [1] "2010-10-01 12:12:00 PDT"
# Use read_csv to import rversions.csv
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine structure of datetime column
str(releases$datetime)
## POSIXct[1:105], format: "1997-12-04 08:47:58" "1997-12-21 13:09:22" ...
# Import "cran-logs_2015-04-17.csv" with read_csv()
logs <- read_csv("./RInputFiles/cran-logs_2015-04-17.csv")
## Parsed with column specification:
## cols(
## datetime = col_datetime(format = ""),
## r_version = col_character(),
## country = col_character()
## )
# Print logs
logs
## # A tibble: 100,000 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-16 22:40:19 3.1.3 CO
## 2 2015-04-16 09:11:04 3.1.3 GB
## 3 2015-04-16 17:12:37 3.1.3 DE
## 4 2015-04-18 12:34:43 3.2.0 GB
## 5 2015-04-16 04:49:18 3.1.3 PE
## 6 2015-04-16 06:40:44 3.1.3 TW
## 7 2015-04-16 00:21:36 3.1.3 US
## 8 2015-04-16 10:27:23 3.1.3 US
## 9 2015-04-16 01:59:43 3.1.3 SG
## 10 2015-04-18 15:41:32 3.2.0 CA
## # ... with 99,990 more rows
# Store the release time as a POSIXct object
release_time <- as.POSIXct("2015-04-16 07:13:33", tz = "UTC")
# When is the first download of 3.2.0?
logs %>%
filter(r_version == "3.2.0")
## # A tibble: 35,928 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-18 12:34:43 3.2.0 GB
## 2 2015-04-18 15:41:32 3.2.0 CA
## 3 2015-04-18 14:58:41 3.2.0 IE
## 4 2015-04-18 16:44:45 3.2.0 US
## 5 2015-04-18 04:34:35 3.2.0 US
## 6 2015-04-18 22:29:45 3.2.0 CH
## 7 2015-04-17 16:21:06 3.2.0 US
## 8 2015-04-18 20:34:57 3.2.0 AT
## 9 2015-04-17 18:23:19 3.2.0 US
## 10 2015-04-18 03:00:31 3.2.0 US
## # ... with 35,918 more rows
# Examine histograms of downloads by version
# (the vline marks the release instant; as.numeric() converts the POSIXct to
# the numeric scale the axis uses)
ggplot(logs, aes(x = datetime)) +
geom_histogram() +
geom_vline(aes(xintercept = as.numeric(release_time)))+
facet_wrap(~ r_version, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Chapter 2 - Parsing and Manipulating Dates with lubridate
Parsing dates with lubridate:
Weather in Auckland (data from Weather Underground, METAR from Auckland airport):
Extracting parts of a datetime:
Rounding datetimes:
Example code includes:
# ---- Chapter 2 example session: parsing dates with lubridate ----
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(readr)
library(dplyr)
library(ggplot2)
library(ggridges)
library(stringr)
# Parse x
x <- "2010 September 20th" # 2010-09-20
ymd(x)
## [1] "2010-09-20"
# Parse y
y <- "02.01.2010" # 2010-01-02
dmy(y)
## [1] "2010-01-02"
# Parse z
z <- "Sep, 12th 2010 14:00" # 2010-09-12T14:00
mdy_hm(z)
## [1] "2010-09-12 14:00:00 UTC"
# Specify an order string to parse x
# (order codes: A = weekday name, m = month, d = day, y = year,
#  I = 12-hour clock hour, p = am/pm)
x <- "Monday June 1st 2010 at 4pm"
parse_date_time(x, orders = "AmdyIp")
## [1] "2010-06-01 16:00:00 UTC"
# Specify order to include both "mdy" and "dmy"
two_orders <- c("October 7, 2001", "October 13, 2002", "April 13, 2003",
"17 April 2005", "23 April 2017")
parse_date_time(two_orders, orders = c("mdy", "dmy"))
## [1] "2001-10-07 UTC" "2002-10-13 UTC" "2003-04-13 UTC" "2005-04-17 UTC"
## [5] "2017-04-23 UTC"
# Specify order to include "dOmY", "OmY" and "Y"
# (missing components default to the first month/day, e.g. "1253" -> 1253-01-01)
short_dates <- c("11 December 1282", "May 1372", "1253")
parse_date_time(short_dates, orders = c("dOmY", "OmY", "Y"))
## [1] "1282-12-11 UTC" "1372-05-01 UTC" "1253-01-01 UTC"
# Import CSV with read_csv()
akl_daily_raw <- read_csv("./RInputFiles/akl_weather_daily.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## max_temp = col_integer(),
## min_temp = col_integer(),
## mean_temp = col_integer(),
## mean_rh = col_integer(),
## events = col_character(),
## cloud_cover = col_integer()
## )
# Print akl_daily_raw
akl_daily_raw
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <chr> <int> <int> <int> <int> <chr> <int>
## 1 2007-9-1 60 51 56 75 <NA> 4
## 2 2007-9-2 60 53 56 82 Rain 4
## 3 2007-9-3 57 51 54 78 <NA> 6
## 4 2007-9-4 64 50 57 80 Rain 6
## 5 2007-9-5 53 48 50 90 Rain 7
## 6 2007-9-6 57 42 50 69 <NA> 1
## 7 2007-9-7 59 41 50 77 <NA> 4
## 8 2007-9-8 59 46 52 80 <NA> 5
## 9 2007-9-9 55 50 52 88 Rain 7
## 10 2007-9-10 59 50 54 82 Rain 4
## # ... with 3,651 more rows
# Parse date (ymd() tolerates the unpadded "2007-9-1" style above)
akl_daily <- akl_daily_raw %>%
mutate(date = ymd(date))
# Print akl_daily
akl_daily
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <date> <int> <int> <int> <int> <chr> <int>
## 1 2007-09-01 60 51 56 75 <NA> 4
## 2 2007-09-02 60 53 56 82 Rain 4
## 3 2007-09-03 57 51 54 78 <NA> 6
## 4 2007-09-04 64 50 57 80 Rain 6
## 5 2007-09-05 53 48 50 90 Rain 7
## 6 2007-09-06 57 42 50 69 <NA> 1
## 7 2007-09-07 59 41 50 77 <NA> 4
## 8 2007-09-08 59 46 52 80 <NA> 5
## 9 2007-09-09 55 50 52 88 Rain 7
## 10 2007-09-10 59 50 54 82 Rain 4
## # ... with 3,651 more rows
# Plot to check work
ggplot(akl_daily, aes(x = date, y = max_temp)) +
geom_line()
## Warning: Removed 1 rows containing missing values (geom_path).
# Import "akl_weather_hourly_2016.csv"
akl_hourly_raw <- read_csv("./RInputFiles/akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## mday = col_integer(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_integer(),
## date_utc = col_datetime(format = "")
## )
# Print akl_hourly_raw
akl_hourly_raw
## # A tibble: 17,454 x 10
## year month mday time temperature weather conditions events humidity
## <int> <int> <int> <time> <dbl> <chr> <chr> <chr> <int>
## 1 2016 1 1 00:00 68.0 Clear Clear <NA> 68
## 2 2016 1 1 00:30 68.0 Clear Clear <NA> 68
## 3 2016 1 1 01:00 68.0 Clear Clear <NA> 73
## 4 2016 1 1 01:30 68.0 Clear Clear <NA> 68
## 5 2016 1 1 02:00 68.0 Clear Clear <NA> 68
## 6 2016 1 1 02:30 68.0 Clear Clear <NA> 68
## 7 2016 1 1 03:00 68.0 Clear Clear <NA> 68
## 8 2016 1 1 03:30 68.0 Cloudy Partly Cl~ <NA> 68
## 9 2016 1 1 04:00 68.0 Cloudy Scattered~ <NA> 68
## 10 2016 1 1 04:30 66.2 Cloudy Partly Cl~ <NA> 73
## # ... with 17,444 more rows, and 1 more variable: date_utc <dttm>
# Use make_date() to combine year, month and mday
akl_hourly <- akl_hourly_raw %>%
mutate(date = make_date(year = year, month = month, day = mday))
# Parse datetime_string (built as ISO-8601 "date T time" so ymd_hms() parses it)
akl_hourly <- akl_hourly %>%
mutate(
datetime_string = paste(date, time, sep = "T"),
datetime = ymd_hms(datetime_string)
)
# Print date, time and datetime columns of akl_hourly
akl_hourly %>% select(date, time, datetime)
## # A tibble: 17,454 x 3
## date time datetime
## <date> <time> <dttm>
## 1 2016-01-01 00:00 2016-01-01 00:00:00
## 2 2016-01-01 00:30 2016-01-01 00:30:00
## 3 2016-01-01 01:00 2016-01-01 01:00:00
## 4 2016-01-01 01:30 2016-01-01 01:30:00
## 5 2016-01-01 02:00 2016-01-01 02:00:00
## 6 2016-01-01 02:30 2016-01-01 02:30:00
## 7 2016-01-01 03:00 2016-01-01 03:00:00
## 8 2016-01-01 03:30 2016-01-01 03:30:00
## 9 2016-01-01 04:00 2016-01-01 04:00:00
## 10 2016-01-01 04:30 2016-01-01 04:30:00
## # ... with 17,444 more rows
# Plot to check work
ggplot(akl_hourly, aes(x = datetime, y = temperature)) +
geom_line()
# ---- Extracting parts of a datetime and rounding ----
# Examine the head() of release_time
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
release_time <- releases %>% pull(datetime)
head(release_time)
## [1] "1997-12-04 08:47:58 UTC" "1997-12-21 13:09:22 UTC"
## [3] "1998-01-10 00:31:55 UTC" "1998-03-14 19:25:55 UTC"
## [5] "1998-05-02 07:58:17 UTC" "1998-06-14 12:56:20 UTC"
# Examine the head() of the months of release_time
head(month(release_time))
## [1] 12 12 1 3 5 6
# Extract the month of releases
month(release_time) %>% table()
## .
## 1 2 3 4 5 6 7 8 9 10 11 12
## 5 6 8 18 5 16 4 7 2 15 6 13
# Extract the year of releases
year(release_time) %>% table()
## .
## 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## 2 10 9 6 6 5 5 4 4 4 4 6 5 4 6
## 2012 2013 2014 2015 2016 2017
## 4 4 4 5 5 3
# How often is the hour before 12 (noon)? (mean of a logical = proportion TRUE)
mean(hour(release_time) < 12)
## [1] 0.752381
# How often is the release in am?
mean(am(release_time))
## [1] 0.752381
# Use wday() to tabulate release by day of the week
wday(releases$datetime) %>% table()
## .
## 1 2 3 4 5 6 7
## 3 29 9 12 18 31 3
# Add label = TRUE to make table more readable
wday(releases$datetime, label=TRUE) %>% table()
## .
## Sun Mon Tue Wed Thu Fri Sat
## 3 29 9 12 18 31 3
# Create column wday to hold labelled week days
releases$wday <- wday(releases$datetime, label=TRUE)
# Plot barchart of weekday by type of release
ggplot(releases, aes(x=wday)) +
geom_bar() +
facet_wrap(~ type, ncol = 1, scale = "free_y")
# Add columns for year, yday and month
akl_daily <- akl_daily %>%
mutate(
year = year(date),
yday = yday(date),
month = month(date, label=TRUE))
# Plot max_temp by yday for all years
ggplot(akl_daily, aes(x = yday, y = max_temp)) +
geom_line(aes(group = year), alpha = 0.5)
## Warning: Removed 1 rows containing missing values (geom_path).
# Examine distribution of max_temp by month
ggplot(akl_daily, aes(x = max_temp, y = month, height = ..density..)) +
geom_density_ridges(stat = "density")
## Warning: Removed 10 rows containing non-finite values (stat_density).
# Create new columns hour, month and rainy
akl_hourly <- akl_hourly %>%
mutate(
hour = hour(datetime),
month = month(datetime, label=TRUE),
rainy = (weather == "Precipitation")
)
# Filter for hours between 8am and 10pm (inclusive)
akl_day <- akl_hourly %>%
filter(hour >= 8, hour <= 22)
# Summarise for each date if there is any rain
rainy_days <- akl_day %>%
group_by(month, date) %>%
summarise(
any_rain = any(rainy)
)
# Summarise for each month, the number of days with rain
# (summarise() peels off the innermost grouping level, so this is per month)
rainy_days %>%
summarise(
days_rainy = sum(any_rain)
)
## # A tibble: 12 x 2
## month days_rainy
## <ord> <int>
## 1 Jan 15
## 2 Feb 13
## 3 Mar 12
## 4 Apr 15
## 5 May 21
## 6 Jun 19
## 7 Jul 22
## 8 Aug 16
## 9 Sep 25
## 10 Oct 20
## 11 Nov 19
## 12 Dec 11
# NOTE(review): the name suggests the R 3.4.1 release, but 3.4.1 was released
# 2017-06-30 (see last_release above) — confirm this 2016 timestamp is intended.
r_3_4_1 <- ymd_hms("2016-05-03 07:13:28 UTC")
# Round down to day
floor_date(r_3_4_1, unit = "day")
## [1] "2016-05-03 UTC"
# Round to nearest 5 minutes
round_date(r_3_4_1, unit = "5 minutes")
## [1] "2016-05-03 07:15:00 UTC"
# Round up to week
ceiling_date(r_3_4_1, unit = "week")
## [1] "2016-05-08 UTC"
# Subtract r_3_4_1 rounded down to day (i.e. the time elapsed since midnight)
r_3_4_1 - floor_date(r_3_4_1, unit = "day")
## Time difference of 7.224444 hours
# Create day_hour, datetime rounded down to hour
akl_hourly <- akl_hourly %>%
mutate(
day_hour = floor_date(datetime, unit = "hour")
)
# Count observations per hour (half-hourly data should give n = 2 per hour)
akl_hourly %>%
count(day_hour)
## # A tibble: 8,770 x 2
## day_hour n
## <dttm> <int>
## 1 2016-01-01 00:00:00 2
## 2 2016-01-01 01:00:00 2
## 3 2016-01-01 02:00:00 2
## 4 2016-01-01 03:00:00 2
## 5 2016-01-01 04:00:00 2
## 6 2016-01-01 05:00:00 2
## 7 2016-01-01 06:00:00 2
## 8 2016-01-01 07:00:00 2
## 9 2016-01-01 08:00:00 2
## 10 2016-01-01 09:00:00 2
## # ... with 8,760 more rows
# Find day_hours with n != 2 (hours with missing or duplicated observations)
akl_hourly %>%
count(day_hour) %>%
filter(n != 2) %>%
arrange(desc(n))
## # A tibble: 92 x 2
## day_hour n
## <dttm> <int>
## 1 2016-04-03 02:00:00 4
## 2 2016-09-25 00:00:00 4
## 3 2016-06-26 09:00:00 1
## 4 2016-09-01 23:00:00 1
## 5 2016-09-02 01:00:00 1
## 6 2016-09-04 11:00:00 1
## 7 2016-09-04 16:00:00 1
## 8 2016-09-04 17:00:00 1
## 9 2016-09-05 00:00:00 1
## 10 2016-09-05 15:00:00 1
## # ... with 82 more rows
Chapter 3 - Arithmetic with Dates and Times
Taking differences of datetimes:
Time spans - difficult because they do not have a constant meaning (e.g., the impact of daylight saving time):
Intervals - third option in lubridate for storing times:
Example code includes:
# ---- Chapter 3 example session: datetime arithmetic ----
# The date of landing and moment of step
date_landing <- mdy("July 20, 1969")
moment_step <- mdy_hms("July 20, 1969, 02:56:15", tz = "UTC")
# How many days since the first man on the moon?
# (output below is session-dependent: it was captured on the knit date)
difftime(today(), date_landing, units = "days")
## Time difference of 17787 days
# How many seconds since the first man on the moon?
difftime(now(), moment_step, units = "secs")
## Time difference of 1536835289 secs
# Three dates
mar_11 <- ymd_hms("2017-03-11 12:00:00",
tz = "America/Los_Angeles")
mar_12 <- ymd_hms("2017-03-12 12:00:00",
tz = "America/Los_Angeles")
mar_13 <- ymd_hms("2017-03-13 12:00:00",
tz = "America/Los_Angeles")
# Difference between mar_13 and mar_12 in seconds
difftime(mar_13, mar_12, units = "secs")
## Time difference of 86400 secs
# Difference between mar_12 and mar_11 in seconds
# (only 82800 s = 23 hours: the clocks spring forward on 2017-03-12 in LA)
difftime(mar_12, mar_11, units = "secs")
## Time difference of 82800 secs
# Add a period of one week to mon_2pm (periods work in calendar units)
mon_2pm <- dmy_hm("27 Aug 2018 14:00")
mon_2pm + weeks(1)
## [1] "2018-09-03 14:00:00 UTC"
# Add a duration of 81 hours to tue_9am (durations work in exact seconds)
tue_9am <- dmy_hm("28 Aug 2018 9:00")
tue_9am + dhours(81)
## [1] "2018-08-31 18:00:00 UTC"
# Subtract a period of five years from today()
today() - years(5)
## [1] "2013-04-01"
# Subtract a duration of five years from today()
# (one day off from the period result because of leap days in the span)
today() - dyears(5)
## [1] "2013-04-02"
# Time of North American Eclipse 2017
eclipse_2017 <- ymd_hms("2017-08-21 18:26:40")
# Duration of 29 days, 12 hours, 44 mins and 3 secs
synodic <- ddays(29) + dhours(12) + dminutes(44) + dseconds(3)
# 223 synodic months
saros <- 223 * synodic
# Add saros to eclipse_2017
eclipse_2017 + saros
## [1] "2035-09-02 02:09:49 UTC"
# Add a period of 8 hours to today
today_8am <- today() + hours(8)
# Sequence of two weeks from 1 to 26
every_two_weeks <- 1:26 * weeks(2)
# Create datetime for every two weeks for a year
today_8am + every_two_weeks
## [1] "2018-04-15 08:00:00 UTC" "2018-04-29 08:00:00 UTC"
## [3] "2018-05-13 08:00:00 UTC" "2018-05-27 08:00:00 UTC"
## [5] "2018-06-10 08:00:00 UTC" "2018-06-24 08:00:00 UTC"
## [7] "2018-07-08 08:00:00 UTC" "2018-07-22 08:00:00 UTC"
## [9] "2018-08-05 08:00:00 UTC" "2018-08-19 08:00:00 UTC"
## [11] "2018-09-02 08:00:00 UTC" "2018-09-16 08:00:00 UTC"
## [13] "2018-09-30 08:00:00 UTC" "2018-10-14 08:00:00 UTC"
## [15] "2018-10-28 08:00:00 UTC" "2018-11-11 08:00:00 UTC"
## [17] "2018-11-25 08:00:00 UTC" "2018-12-09 08:00:00 UTC"
## [19] "2018-12-23 08:00:00 UTC" "2019-01-06 08:00:00 UTC"
## [21] "2019-01-20 08:00:00 UTC" "2019-02-03 08:00:00 UTC"
## [23] "2019-02-17 08:00:00 UTC" "2019-03-03 08:00:00 UTC"
## [25] "2019-03-17 08:00:00 UTC" "2019-03-31 08:00:00 UTC"
jan_31 <- ymd("2018-01-31")
# A sequence of 1 to 12 periods of 1 month
month_seq <- 1:12 * months(1)
# Add 1 to 12 months to jan_31
# (plain + yields NA whenever the target month has no 31st day)
jan_31 + month_seq
## [1] NA "2018-03-31" NA "2018-05-31" NA
## [6] "2018-07-31" "2018-08-31" NA "2018-10-31" NA
## [11] "2018-12-31" "2019-01-31"
# Replace + with %m+% (rolls back to the last valid day instead of NA)
jan_31 %m+% month_seq
## [1] "2018-02-28" "2018-03-31" "2018-04-30" "2018-05-31" "2018-06-30"
## [6] "2018-07-31" "2018-08-31" "2018-09-30" "2018-10-31" "2018-11-30"
## [11] "2018-12-31" "2019-01-31"
# Replace + with %m-%
jan_31 %m-% month_seq
## [1] "2017-12-31" "2017-11-30" "2017-10-31" "2017-09-30" "2017-08-31"
## [6] "2017-07-31" "2017-06-30" "2017-05-31" "2017-04-30" "2017-03-31"
## [11] "2017-02-28" "2017-01-31"
# Create monarchs
mNames <- c('Elizabeth II' ,'Victoria' ,'George V' ,'George III' ,'George VI' ,'George IV' ,'Edward VII' ,'William IV' ,'Edward VIII' ,'George III(also United Kingdom)' ,'George II' ,'George I' ,'Anne' ,'Henry III' ,'Edward III' ,'Elizabeth I' ,'Henry VI' ,'Henry VI' ,'Æthelred II' ,'Æthelred II' ,'Henry VIII' ,'Charles II' ,'Henry I' ,'Henry II(co-ruler with Henry the Young King)' ,'Edward I' ,'Alfred the Great' ,'Edward the Elder' ,'Charles I' ,'Henry VII' ,'Edward the Confessor' ,'Richard II' ,'James I' ,'Edward IV' ,'Edward IV' ,'William I' ,'Edward II' ,'Cnut' ,'Stephen' ,'Stephen' ,'John' ,'Edgar I' ,'Æthelstan' ,'Henry IV' ,'William III(co-ruler with Mary II)' ,'Henry the Young King(co-ruler with Henry II)' ,'William II' ,'Richard I' ,'Eadred' ,'Henry V' ,'Edmund I' ,'Edward VI' ,'Mary II(co-ruler with William III)' ,'Mary I' ,'Anne(also Kingdom of Great Britain)' ,'Eadwig' ,'James II' ,'Edward the Martyr' ,'Harold I' ,'Harthacnut' ,'Richard III' ,'Louis (disputed)' ,'Harold II' ,'Edmund II' ,'Matilda (disputed)' ,'Edward V' ,'Edgar II' ,'Sweyn Forkbeard' ,'Jane (disputed)' ,'James VI' ,'William I' ,'Constantine II' ,'David II' ,'Alexander III' ,'Malcolm III' ,'Alexander II' ,'James I' ,'Malcolm II' ,'James V' ,'David I' ,'James III' ,'Charles II' ,'Charles II' ,'James IV' ,'Mary I' ,'Charles I' ,'Kenneth II' ,'James II' ,'Robert I' ,'Robert II' ,'Alexander I' ,'Macbeth' ,'Robert III' ,'Constantine I' ,'Kenneth MacAlpin' ,'William II' ,'Malcolm IV' ,'Giric(co-ruler with Eochaid?)' ,'Donald II' ,'Malcolm I' ,'Edgar' ,'Kenneth III' ,'Indulf' ,'Duncan I' ,'Mary II' ,'Amlaíb' ,'Anne(also Kingdom of Great Britain)' ,'Dub' ,'Cuilén' ,'Domnall mac Ailpín' ,'James VII' ,'Margaret' ,'John Balliol' ,'Donald III' ,'Constantine III' ,'Áed mac Cináeda' ,'Lulach' ,'Duncan II' ,'Ruaidrí Ua Conchobair' ,'Edward Bruce (disputed)' ,'Brian Ua Néill (disputed)' ,'Gruffudd ap Cynan' ,'Llywelyn the Great' ,'Owain Gwynedd' ,'Dafydd ab Owain Gwynedd' ,'Hywel ab Owain Gwynedd' 
,'Llywelyn ap Gruffudd' ,'Owain Glyndŵr (disputed)' ,'Owain Goch ap Gruffydd' ,'Owain Lawgoch (disputed)' ,'Dafydd ap Llywelyn' ,'Dafydd ap Gruffydd')
mDominion <- c('United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'Great Britain' ,'Great Britain' ,'Great Britain' ,'Great Britain' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Ireland' ,'Ireland' ,'Ireland' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Wales' ,'Wales' ,'Wales' ,'Wales' ,'Wales' ,'Wales')
mFrom <- c('1952-02-06' ,'1837-06-20' ,'1910-05-06' ,'1801-01-01' ,'1936-12-11' ,'1820-01-29' ,'1901-01-22' ,'1830-06-26' ,'1936-01-20' ,'1760-10-25' ,'1727-06-22' ,'1714-08-01' ,'1707-05-01' ,'NA' ,'1327-01-25' ,'1558-11-17' ,'1422-08-31' ,'1470-10-31' ,'978-03-18' ,'1014-02-03' ,'1509-04-22' ,'1649-01-30' ,'1100-08-03' ,'1154-10-25' ,'1272-11-20' ,'871-04-24' ,'899-10-27' ,'1625-03-27' ,'1485-08-22' ,'1042-06-08' ,'1377-06-22' ,'1603-03-24' ,'1461-03-04' ,'1471-04-11' ,'1066-12-12' ,'1307-07-07' ,'1016-11-30' ,'1135-12-22' ,'1141-11-01' ,'1199-04-06' ,'959-10-01' ,'924-08-02' ,'1399-09-29' ,'1689-02-13' ,'1170-06-14' ,'1087-09-09' ,'1189-07-06' ,'946-05-26' ,'1413-03-21' ,'939-10-27' ,'1547-01-28' ,'1689-02-13' ,'1553-07-19' ,'1702-03-08' ,'955-11-23' ,'1685-02-06' ,'975-07-09' ,'1037-11-12' ,'1040-03-17' ,'1483-06-26' ,'1216-06-14' ,'1066-01-05' ,'1016-04-23' ,'1141-04-07' ,'1483-04-09' ,'1066-10-15' ,'1013-12-25' ,'1553-07-10' ,'1567-07-24' ,'1165-12-09' ,'900-01-01' ,'1329-06-07' ,'1249-07-06' ,'1058-03-17' ,'1214-12-04' ,'1406-04-04' ,'1005-03-25' ,'1513-09-09' ,'1124-04-23' ,'1460-08-03' ,'1649-01-30' ,'1660-05-29' ,'1488-06-11' ,'1542-12-14' ,'1625-03-27' ,'971-01-01' ,'1437-02-21' ,'1306-03-25' ,'1371-02-22' ,'1107-01-08' ,'1040-08-14' ,'1390-04-19' ,'862-01-01' ,'843-01-01' ,'1689-05-11' ,'1153-05-24' ,'878-01-01' ,'889-01-01' ,'943-01-01' ,'1097-01-01' ,'997-01-01' ,'954-01-01' ,'1034-11-25' ,'1689-04-11' ,'971-01-01' ,'1702-03-08' ,'962-01-01' ,'NA' ,'858-01-01' ,'1685-02-06' ,'1286-11-25' ,'1292-11-17' ,'1093-11-13' ,'1095-01-01' ,'877-01-01' ,'1057-08-15' ,'1094-05-01' ,'1166-01-01' ,'1315-06-01' ,'1258-01-01' ,'1081-01-01' ,'1195-01-01' ,'1137-01-01' ,'1170-01-01' ,'1170-01-01' ,'1253-01-01' ,'1400-09-16' ,'1246-02-25' ,'1372-05-01' ,'1240-04-12' ,'1282-12-11')
mTo <- c('2018-02-08' ,'1901-01-22' ,'1936-01-20' ,'1820-01-29' ,'1952-02-06' ,'1830-06-26' ,'1910-05-06' ,'1837-06-20' ,'1936-12-11' ,'1801-01-01' ,'1760-10-25' ,'1727-06-11' ,'1714-08-01' ,'1272-11-16' ,'1377-06-21' ,'1603-03-24' ,'1461-03-04' ,'1471-04-11' ,'1013-12-25' ,'1016-04-23' ,'1547-01-28' ,'1685-02-06' ,'1135-12-01' ,'1189-07-06' ,'1307-07-07' ,'899-10-26' ,'924-07-17' ,'1649-01-30' ,'1509-04-21' ,'1066-01-05' ,'1399-09-29' ,'1625-03-27' ,'1470-10-03' ,'1483-04-09' ,'1087-09-09' ,'1327-01-20' ,'1035-11-12' ,'1141-04-07' ,'1154-10-25' ,'1216-10-19' ,'975-07-08' ,'939-10-27' ,'1413-03-20' ,'1702-03-08' ,'1183-06-11' ,'1100-08-02' ,'1199-04-06' ,'955-11-23' ,'1422-08-31' ,'946-05-26' ,'1553-07-06' ,'1694-12-28' ,'1558-11-17' ,'1707-04-30' ,'959-10-01' ,'1688-12-11' ,'978-03-18' ,'1040-03-17' ,'1042-06-08' ,'1485-08-22' ,'1217-09-22' ,'1066-10-14' ,'1016-11-30' ,'1141-11-01' ,'1483-06-26' ,'1066-12-17' ,'1014-02-03' ,'1553-07-19' ,'1625-03-27' ,'1214-12-04' ,'943-01-01' ,'1371-02-22' ,'1286-03-19' ,'1093-11-13' ,'1249-07-06' ,'1437-02-21' ,'1034-11-25' ,'1542-12-14' ,'1153-05-24' ,'1488-06-11' ,'1651-09-03' ,'1685-02-06' ,'1513-09-09' ,'1567-07-24' ,'1649-01-30' ,'995-01-01' ,'1460-08-03' ,'1329-06-07' ,'1390-04-19' ,'1124-04-23' ,'1057-08-15' ,'1406-04-04' ,'877-01-01' ,'858-02-13' ,'1702-03-08' ,'1165-12-09' ,'889-01-01' ,'900-01-01' ,'954-01-01' ,'1107-01-08' ,'1005-03-25' ,'962-01-01' ,'1040-08-14' ,'1694-12-28' ,'977-01-01' ,'1707-04-30' ,'NA' ,'971-01-01' ,'862-04-13' ,'1688-12-11' ,'1290-09-26' ,'1296-07-10' ,'1097-01-01' ,'1097-01-01' ,'878-01-01' ,'1058-03-17' ,'1094-11-12' ,'1193-01-01' ,'1318-10-14' ,'1260-01-01' ,'1137-01-01' ,'1240-04-11' ,'1170-01-01' ,'1195-01-01' ,'1170-01-01' ,'1282-12-11' ,'1416-01-01' ,'1255-01-01' ,'1378-07-01' ,'1246-02-25' ,'1283-10-03')
padMDate <- function(x) {
  # Reassemble the pieces of a "-"-split date string into "YYYY-MM-DD",
  # zero-padding the year to four digits so lubridate::ymd() can parse
  # early dates, e.g. c("66", "01", "26") -> "0066-01-26".
  #
  # x: character vector of split date components c(year, month, day), as
  #    produced by str_split(date_string, "-"); a missing date is flagged
  #    by an NA year or the literal string "NA".
  #
  # Returns a single "YYYY-MM-DD" string, or NA_character_ for missing dates.
  if (is.na(x[1]) || x[1] == "NA") {
    # Typed NA so sapply() over mixed results stays a character vector.
    NA_character_
  } else {
    # Base-R left zero-pad of the year (replaces the stringr::str_pad call,
    # removing this function's only package dependency).
    year <- paste0(strrep("0", max(0L, 4L - nchar(x[1]))), x[1])
    paste(year, x[2], x[3], sep = "-")
  }
}
# Build the monarchs tibble; str_split() + padMDate() zero-pads early years
# (e.g. "871-04-24" -> "0871-04-24") so ymd() parses them correctly.
monarchs <- tibble::tibble(name=mNames, dominion=mDominion,
from=ymd(sapply(str_split(mFrom, "-"), FUN=padMDate)),
to=ymd(sapply(str_split(mTo, "-"), FUN=padMDate))
)
# Print monarchs
monarchs
## # A tibble: 131 x 4
## name dominion from to
## <chr> <chr> <date> <date>
## 1 Elizabeth II United Kingdom 1952-02-06 2018-02-08
## 2 Victoria United Kingdom 1837-06-20 1901-01-22
## 3 George V United Kingdom 1910-05-06 1936-01-20
## 4 George III United Kingdom 1801-01-01 1820-01-29
## 5 George VI United Kingdom 1936-12-11 1952-02-06
## 6 George IV United Kingdom 1820-01-29 1830-06-26
## 7 Edward VII United Kingdom 1901-01-22 1910-05-06
## 8 William IV United Kingdom 1830-06-26 1837-06-20
## 9 Edward VIII United Kingdom 1936-01-20 1936-12-11
## 10 George III(also United Kingdom) Great Britain 1760-10-25 1801-01-01
## # ... with 121 more rows
# Create an interval for reign (%--% builds a lubridate Interval, which keeps
# both endpoints rather than just a span length)
monarchs <- monarchs %>%
mutate(reign = from %--% to)
# Find the length of reign, and arrange (int_length() returns seconds)
monarchs %>%
mutate(length = int_length(reign)) %>%
arrange(desc(length)) %>%
select(name, length, dominion)
## # A tibble: 131 x 3
## name length dominion
## <chr> <dbl> <chr>
## 1 Elizabeth II 2083017600 United Kingdom
## 2 Victoria 2006726400 United Kingdom
## 3 James VI 1820102400 Scotland
## 4 Gruffudd ap Cynan 1767139200 Gwynedd
## 5 Edward III 1590624000 England
## 6 William I 1545868800 Scotland
## 7 Llywelyn the Great 1428796800 Gwynedd
## 8 Elizabeth I 1399507200 England
## 9 Constantine II 1356912000 Scotland
## 10 David II 1316304000 Scotland
## # ... with 121 more rows
# Print halleys
pDate <- c('66-01-26', '141-03-25', '218-04-06', '295-04-07', '374-02-13', '451-07-03', '530-11-15', '607-03-26', '684-11-26', '760-06-10', '837-02-25', '912-07-27', '989-09-02', '1066-03-25', '1145-04-19', '1222-09-10', '1301-10-22', '1378-11-09', '1456-01-08', '1531-08-26', '1607-10-27', '1682-09-15', '1758-03-13', '1835-11-16', '1910-04-20', '1986-02-09', '2061-07-28')
sDate <- c('66-01-25', '141-03-22', '218-04-06', '295-04-07', '374-02-13', '451-06-28', '530-09-27', '607-03-15', '684-10-02', '760-05-20', '837-02-25', '912-07-18', '989-09-02', '1066-01-01', '1145-04-15', '1222-09-10', '1301-10-22', '1378-11-09', '1456-01-08', '1531-08-26', '1607-10-27', '1682-09-15', '1758-03-13', '1835-08-01', '1910-04-20', '1986-02-09', '2061-07-28')
eDate <- c('66-01-26', '141-03-25', '218-05-17', '295-04-20', '374-02-16', '451-07-03', '530-11-15', '607-03-26', '684-11-26', '760-06-10', '837-02-28', '912-07-27', '989-09-05', '1066-03-25', '1145-04-19', '1222-09-28', '1301-10-31', '1378-11-14', '1456-06-09', '1531-08-26', '1607-10-27', '1682-09-15', '1758-12-25', '1835-11-16', '1910-05-20', '1986-02-09', '2061-07-28')
# Build the halleys tibble from the pDate/sDate/eDate vectors above, again
# zero-padding early years via padMDate() so ymd() parses them.
halleys <- tibble::tibble(perihelion_date=ymd(sapply(str_split(pDate, "-"), FUN=padMDate)),
start_date=ymd(sapply(str_split(sDate, "-"), FUN=padMDate)),
end_date=ymd(sapply(str_split(eDate, "-"), FUN=padMDate))
)
# New column for interval from start to end date
halleys <- halleys %>%
mutate(visible = start_date %--% end_date)
# The visitation of 1066 (14th row of the table)
halleys_1066 <- halleys[14, ]
# Monarchs in power on perihelion date (%within% tests instant-in-interval)
monarchs %>%
filter(halleys_1066$perihelion_date %within% reign) %>%
select(name, from, to, dominion)
## # A tibble: 2 x 4
## name from to dominion
## <chr> <date> <date> <chr>
## 1 Harold II 1066-01-05 1066-10-14 England
## 2 Malcolm III 1058-03-17 1093-11-13 Scotland
# Monarchs whose reign overlaps visible time (interval-vs-interval test)
monarchs %>%
filter(int_overlaps(halleys_1066$visible, reign)) %>%
select(name, from, to, dominion)
## # A tibble: 3 x 4
## name from to dominion
## <chr> <date> <date> <chr>
## 1 Edward the Confessor 1042-06-08 1066-01-05 England
## 2 Harold II 1066-01-05 1066-10-14 England
## 3 Malcolm III 1058-03-17 1093-11-13 Scotland
# New columns for duration and period (two views of the same interval:
# duration = exact seconds, period = calendar years/months/days)
monarchs <- monarchs %>%
mutate(
duration = as.duration(reign),
period = as.period(reign))
# Examine results
monarchs %>%
select(name, duration, period) %>%
head(10) %>%
print.data.frame()
## name duration
## 1 Elizabeth II 2083017600s (~66.01 years)
## 2 Victoria 2006726400s (~63.59 years)
## 3 George V 811296000s (~25.71 years)
## 4 George III 601948800s (~19.07 years)
## 5 George VI 478224000s (~15.15 years)
## 6 George IV 328406400s (~10.41 years)
## 7 Edward VII 292982400s (~9.28 years)
## 8 William IV 220406400s (~6.98 years)
## 9 Edward VIII 28166400s (~46.57 weeks)
## 10 George III(also United Kingdom) 1268092800s (~40.18 years)
## period
## 1 66y 0m 2d 0H 0M 0S
## 2 63y 7m 2d 0H 0M 0S
## 3 25y 8m 14d 0H 0M 0S
## 4 19y 0m 28d 0H 0M 0S
## 5 15y 1m 26d 0H 0M 0S
## 6 10y 4m 28d 0H 0M 0S
## 7 9y 3m 14d 0H 0M 0S
## 8 6y 11m 25d 0H 0M 0S
## 9 10m 21d 0H 0M 0S
## 10 40y 2m 7d 0H 0M 0S
Chapter 4 - Problems in Practice
Time zones - ways to keep track of times in different locations (can pose analysis challenges):
Importing and exporting datetimes:
Wrap-up:
Example code includes:
# ---- Chapter 4 example session: time zones and import/export ----
# Game2: CAN vs NZL in Edmonton
game2 <- mdy_hm("June 11 2015 19:00")
# Game3: CHN vs NZL in Winnipeg
game3 <- mdy_hm("June 15 2015 18:30")
# Set the timezone to "America/Edmonton"
# (force_tz() keeps the clock time and only relabels the zone)
game2_local <- force_tz(game2, tzone = "America/Edmonton")
game2_local
## [1] "2015-06-11 19:00:00 MDT"
# Set the timezone to "America/Winnipeg"
game3_local <- force_tz(game3, tzone = "America/Winnipeg")
game3_local
## [1] "2015-06-15 18:30:00 CDT"
# How long does the team have to rest?
as.period(game2_local %--% game3_local)
## [1] "3d 22H 30M 0S"
# What time is game2_local in NZ?
# (with_tz() keeps the same instant and changes only how it is displayed)
with_tz(game2_local, tzone = "Pacific/Auckland")
## [1] "2015-06-12 13:00:00 NZST"
# What time is game2_local in Corvallis, Oregon?
with_tz(game2_local, tzone = "America/Los_Angeles")
## [1] "2015-06-11 18:00:00 PDT"
# What time is game3_local in NZ?
with_tz(game3_local, tzone = "Pacific/Auckland")
## [1] "2015-06-16 11:30:00 NZST"
# Examine datetime and date_utc columns
head(akl_hourly$datetime)
## [1] "2016-01-01 00:00:00 UTC" "2016-01-01 00:30:00 UTC"
## [3] "2016-01-01 01:00:00 UTC" "2016-01-01 01:30:00 UTC"
## [5] "2016-01-01 02:00:00 UTC" "2016-01-01 02:30:00 UTC"
head(akl_hourly$date_utc)
## [1] "2015-12-31 11:00:00 UTC" "2015-12-31 11:30:00 UTC"
## [3] "2015-12-31 12:00:00 UTC" "2015-12-31 12:30:00 UTC"
## [5] "2015-12-31 13:00:00 UTC" "2015-12-31 13:30:00 UTC"
# Force datetime to Pacific/Auckland (the clock times were local all along,
# so force_tz() attaches the correct zone without shifting them)
akl_hourly <- akl_hourly %>%
mutate(
datetime = force_tz(datetime, tzone = "Pacific/Auckland"))
# Reexamine datetime
head(akl_hourly$datetime)
## [1] "2016-01-01 00:00:00 NZDT" "2016-01-01 00:30:00 NZDT"
## [3] "2016-01-01 01:00:00 NZDT" "2016-01-01 01:30:00 NZDT"
## [5] "2016-01-01 02:00:00 NZDT" "2016-01-01 02:30:00 NZDT"
# Are datetime and date_utc the same moments
# (most differences are 0; NOTE(review): the four nonzero rows presumably fall
# around the Pacific/Auckland DST transitions — verify against the raw data)
table(akl_hourly$datetime - akl_hourly$date_utc)
##
## -82800 0 3600
## 2 17450 2
# Import auckland hourly data
akl_hourly <- read_csv("./RInputFiles/akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## mday = col_integer(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_integer(),
## date_utc = col_datetime(format = "")
## )
# Examine structure of time column (hms stores seconds since midnight)
str(akl_hourly$time)
## Classes 'hms', 'difftime' atomic [1:17454] 0 1800 3600 5400 7200 9000 10800 12600 14400 16200 ...
## ..- attr(*, "units")= chr "secs"
# Examine head of time column
head(akl_hourly$time)
## 00:00:00
## 00:30:00
## 01:00:00
## 01:30:00
## 02:00:00
## 02:30:00
# A plot using just time (grouping by date overlays one line per day, which
# shows the daily temperature cycle)
ggplot(akl_hourly, aes(x = time, y = temperature)) +
geom_line(aes(group = make_date(year, month, mday)), alpha = 0.2)
library(microbenchmark)
library(fasttime)
# Examine structure of dates
dates <- paste0(gsub(" ", "T", as.character(akl_hourly$date_utc)), "Z")
str(dates)
## chr [1:17454] "2015-12-31T11:00:00Z" "2015-12-31T11:30:00Z" ...
# Use fastPOSIXct() to parse dates
fastPOSIXct(dates) %>% str()
## POSIXct[1:17454], format: "2015-12-31 05:00:00" "2015-12-31 05:30:00" ...
# Compare speed of fastPOSIXct() to ymd_hms()
microbenchmark(
ymd_hms = ymd_hms(dates),
fasttime = fastPOSIXct(dates),
times = 20)
## Unit: milliseconds
## expr min lq mean median uq max
## ymd_hms 19.170372 21.049592 23.846711 24.186495 25.630499 29.709479
## fasttime 1.553353 1.614539 1.924144 1.658949 1.715596 6.014452
## neval cld
## 20 b
## 20 a
# Head of dates
head(dates)
## [1] "2015-12-31T11:00:00Z" "2015-12-31T11:30:00Z" "2015-12-31T12:00:00Z"
## [4] "2015-12-31T12:30:00Z" "2015-12-31T13:00:00Z" "2015-12-31T13:30:00Z"
# Parse dates with fast_strptime
fast_strptime(dates,
format = "%Y-%m-%dT%H:%M:%SZ") %>% str()
## POSIXlt[1:17454], format: "2015-12-31 11:00:00" "2015-12-31 11:30:00" ...
# Compare speed to ymd_hms() and fasttime
microbenchmark(
ymd_hms = ymd_hms(dates),
fasttime = fastPOSIXct(dates),
fast_strptime = fast_strptime(dates,
format = "%Y-%m-%dT%H:%M:%SZ"),
times = 20)
## Unit: milliseconds
## expr min lq mean median uq
## ymd_hms 18.418764 21.458556 28.927632 23.785032 25.335422
## fasttime 1.569143 1.614342 2.259249 1.675133 1.943566
## fast_strptime 1.273473 1.296763 1.699727 1.322224 1.455059
## max neval cld
## 135.383180 20 b
## 6.997386 20 a
## 7.133576 20 a
finished <- "I finished 'Dates and Times in R' on Thursday, September 20, 2017!"
# Create a stamp based on "Sep 20 2017"
date_stamp <- stamp("September 20, 2017", orders="mdy")
## Multiple formats matched: "%Om %d, %Y"(1), "%B %d, %Y"(1)
## Using: "%B %d, %Y"
# Print date_stamp
date_stamp
## function (x, locale = "English_United States.1252")
## {
## {
## old_lc_time <- Sys.getlocale("LC_TIME")
## if (old_lc_time != locale) {
## Sys.setlocale("LC_TIME", locale)
## on.exit(Sys.setlocale("LC_TIME", old_lc_time))
## }
## }
## format(x, format = "%B %d, %Y")
## }
## <environment: 0x000000001179e8a0>
# Call date_stamp on today()
date_stamp(today())
## [1] "April 01, 2018"
# Create and call a stamp based on "09/20/2017"
stamp("09/20/2017", orders="mdy")(today())
## Multiple formats matched: "%Om/%d/%Y"(1), "%m/%d/%Y"(1)
## Using: "%Om/%d/%Y"
## [1] "04/01/2018"
# Use string finished for stamp()
stamp(finished, orders="amdy")(today())
## Multiple formats matched: "I finished 'Dates and Times in R' on %A, %B %d, %Y!"(1), "I finished 'Dates and Times in R' on %A, %Om %d, %Y!"(0)
## Using: "I finished 'Dates and Times in R' on %A, %B %d, %Y!"
## [1] "I finished 'Dates and Times in R' on Sunday, April 01, 2018!"
Chapter 1 - Working with Increasingly Large Data Sets
What is scalable data processing?:
Working with “out of core” objects using the Bigmemory Project:
References vs. Copies:
Example code includes:
# Load the microbenchmark package
library(microbenchmark)
# Compare the timings for sorting different sizes of vector
mb <- microbenchmark(
# Sort a random normal vector length 1e5
"1e5" = sort(rnorm(1e5)),
# Sort a random normal vector length 2.5e5
"2.5e5" = sort(rnorm(2.5e5)),
# Sort a random normal vector length 5e5
"5e5" = sort(rnorm(5e5)),
"7.5e5" = sort(rnorm(7.5e5)),
"1e6" = sort(rnorm(1e6)),
times = 10
)
# Plot the resulting benchmark object
plot(mb)
# Load the bigmemory package
library(bigmemory)
# Create the big.matrix object: x
x <- read.big.matrix("./RInputFiles/mortgage-sample.csv", header = TRUE,
type = "integer",
backingfile = "mortgage-sample.bin",
descriptorfile = "mortgage-sample.desc")
# Find the dimensions of x
dim(x)
## [1] 70000 16
# Attach mortgage-sample.desc
mort <- attach.big.matrix("mortgage-sample.desc")
# Find the dimensions of mort
dim(mort)
## [1] 70000 16
# Look at the first 6 rows of mort
head(mort)
## enterprise record_number msa perc_minority tract_income_ratio
## [1,] 1 566 1 1 3
## [2,] 1 116 1 3 2
## [3,] 1 239 1 2 2
## [4,] 1 62 1 2 3
## [5,] 1 106 1 2 3
## [6,] 1 759 1 3 3
## borrower_income_ratio loan_purpose federal_guarantee borrower_race
## [1,] 1 2 4 3
## [2,] 1 2 4 5
## [3,] 3 8 4 5
## [4,] 3 2 4 5
## [5,] 3 2 4 9
## [6,] 2 2 4 9
## co_borrower_race borrower_gender co_borrower_gender num_units
## [1,] 9 2 4 1
## [2,] 9 1 4 1
## [3,] 5 1 2 1
## [4,] 9 2 4 1
## [5,] 9 3 4 1
## [6,] 9 1 2 2
## affordability year type
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
## [4,] 4 2009 1
## [5,] 4 2013 1
## [6,] 4 2010 1
# Create mort
mort <- attach.big.matrix("mortgage-sample.desc")
# Look at the first 3 rows
mort[1:3, ]
## enterprise record_number msa perc_minority tract_income_ratio
## [1,] 1 566 1 1 3
## [2,] 1 116 1 3 2
## [3,] 1 239 1 2 2
## borrower_income_ratio loan_purpose federal_guarantee borrower_race
## [1,] 1 2 4 3
## [2,] 1 2 4 5
## [3,] 3 8 4 5
## co_borrower_race borrower_gender co_borrower_gender num_units
## [1,] 9 2 4 1
## [2,] 9 1 4 1
## [3,] 5 1 2 1
## affordability year type
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
# Create a table of the number of mortgages for each year in the data set
table(mort[, "year"])
##
## 2008 2009 2010 2011 2012 2013 2014 2015
## 8468 11101 8836 7996 10935 10216 5714 6734
a <- getLoadedDLLs()
length(a)
## [1] 39
R.utils::gcDLLs()
## named list()
a <- getLoadedDLLs()
length(a)
## [1] 39
# Load the biganalytics package (error in loading to Knit file, works OK otherwise)
library(biganalytics)
## Loading required package: foreach
## Loading required package: biglm
## Loading required package: DBI
# Get the column means of mort
colmean(mort)
## enterprise record_number msa
## 1.3814571 499.9080571 0.8943571
## perc_minority tract_income_ratio borrower_income_ratio
## 1.9701857 2.3431571 2.6898857
## loan_purpose federal_guarantee borrower_race
## 3.7670143 3.9840857 5.3572429
## co_borrower_race borrower_gender co_borrower_gender
## 7.0002714 1.4590714 3.0494857
## num_units affordability year
## 1.0398143 4.2863429 2011.2714714
## type
## 0.5300429
# Use biganalytics' summary function to get a summary of the data
summary(mort)
## min max mean NAs
## enterprise 1.0000000 2.0000000 1.3814571 0.0000000
## record_number 0.0000000 999.0000000 499.9080571 0.0000000
## msa 0.0000000 1.0000000 0.8943571 0.0000000
## perc_minority 1.0000000 9.0000000 1.9701857 0.0000000
## tract_income_ratio 1.0000000 9.0000000 2.3431571 0.0000000
## borrower_income_ratio 1.0000000 9.0000000 2.6898857 0.0000000
## loan_purpose 1.0000000 9.0000000 3.7670143 0.0000000
## federal_guarantee 1.0000000 4.0000000 3.9840857 0.0000000
## borrower_race 1.0000000 9.0000000 5.3572429 0.0000000
## co_borrower_race 1.0000000 9.0000000 7.0002714 0.0000000
## borrower_gender 1.0000000 9.0000000 1.4590714 0.0000000
## co_borrower_gender 1.0000000 9.0000000 3.0494857 0.0000000
## num_units 1.0000000 4.0000000 1.0398143 0.0000000
## affordability 0.0000000 9.0000000 4.2863429 0.0000000
## year 2008.0000000 2015.0000000 2011.2714714 0.0000000
## type 0.0000000 1.0000000 0.5300429 0.0000000
# Use deepcopy() to create first_three
first_three <- deepcopy(mort, cols = 1:3,
backingfile = "first_three.bin",
descriptorfile = "first_three.desc")
# Set first_three_2 equal to first_three
first_three_2 <- first_three
# Set the value in the first row and first column of first_three to NA
first_three[1, 1] <- NA
# Verify the change shows up in first_three_2
first_three_2[1, 1]
## [1] NA
# but not in mort
mort[1, 1]
## [1] 1
Chapter 2 - Processing and Analyzing Data with bigmemory
The Bigmemory Suite of Packages:
Split-Apply-Combine (aka Split-Compute-Combine), run in this course using split() Map() Reduce():
Visualize results using tidyverse:
Limitations of bigmemory - process is useful for dense, numeric matrices that can be stored on hard disk:
Example code includes:
library(bigtabulate)
library(tidyr)
library(ggplot2)
library(biganalytics)
library(dplyr)
race_cat <- c('Native Am', 'Asian', 'Black', 'Pacific Is', 'White', 'Two or More', 'Hispanic', 'Not Avail')
# Call bigtable to create a variable called race_table
race_table <- bigtable(mort, "borrower_race")
# Rename the elements of race_table
names(race_table) <- race_cat
race_table
## Native Am Asian Black Pacific Is White Two or More
## 143 4438 2020 195 50006 528
## Hispanic Not Avail
## 4040 8630
# Create a table of the borrower race by year
race_year_table <- bigtable(mort, c("borrower_race", "year"))
# Convert race_year_table to a data frame, assigning it to rydf
rydf <- as.data.frame(race_year_table)
# Create the new column Race
rydf$Race <- race_cat
# Let's see what it looks like
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
## 9 1505 1755 1240 1013 1009 971 519 618 Not Avail
# Proportion of female borrowers (borrower_gender code 2) among urban
# (msa == 1) and rural (msa == 0) rows of the mortgage matrix.
#
# x:    a matrix-like object with "borrower_gender" and "msa" columns
# rows: the subset of rows (indices or logical vector) to analyze
#
# Returns a length-2 numeric vector: c(urban proportion, rural proportion).
female_residence_prop <- function(x, rows) {
  sub <- x[rows, ]
  is_female <- sub[, "borrower_gender"] == 2
  is_urban <- sub[, "msa"] == 1
  is_rural <- sub[, "msa"] == 0
  # Proportion of female borrowers within each residence type
  c(sum(is_female & is_urban) / sum(is_urban),
    sum(is_female & is_rural) / sum(is_rural))
}
# Find the proportion of female borrowers in 2015
female_residence_prop(mort, mort[, "year"] == 2015)
## [1] 0.2737439 0.2304965
# Split the row numbers of the mortgage data by year
spl <- split(1:nrow(mort), mort[, "year"])
# Call str on spl
str(spl)
## List of 8
## $ 2008: int [1:8468] 2 8 15 17 18 28 35 40 42 47 ...
## $ 2009: int [1:11101] 4 13 25 31 43 49 52 56 67 68 ...
## $ 2010: int [1:8836] 1 6 7 10 21 23 24 27 29 38 ...
## $ 2011: int [1:7996] 11 20 37 46 53 57 73 83 86 87 ...
## $ 2012: int [1:10935] 14 16 26 30 32 33 48 69 81 94 ...
## $ 2013: int [1:10216] 5 9 19 22 36 44 55 58 72 74 ...
## $ 2014: int [1:5714] 3 12 50 60 64 66 103 114 122 130 ...
## $ 2015: int [1:6734] 34 41 54 61 62 65 82 91 102 135 ...
# For each of the row splits, find the female residence proportion
all_years <- Map(function(rows) female_residence_prop(mort, rows), spl)
# Call str on all_years
str(all_years)
## List of 8
## $ 2008: num [1:2] 0.275 0.204
## $ 2009: num [1:2] 0.244 0.2
## $ 2010: num [1:2] 0.241 0.201
## $ 2011: num [1:2] 0.252 0.241
## $ 2012: num [1:2] 0.244 0.21
## $ 2013: num [1:2] 0.275 0.257
## $ 2014: num [1:2] 0.289 0.268
## $ 2015: num [1:2] 0.274 0.23
# Collect the results as rows in a matrix
prop_female <- Reduce(rbind, all_years)
# Rename the row and column names
dimnames(prop_female) <- list(names(all_years), c("prop_female_urban", "prop_female_rural"))
# View the matrix
prop_female
## prop_female_urban prop_female_rural
## 2008 0.2748514 0.2039474
## 2009 0.2441074 0.2002978
## 2010 0.2413881 0.2014028
## 2011 0.2520644 0.2408931
## 2012 0.2438950 0.2101313
## 2013 0.2751059 0.2567164
## 2014 0.2886756 0.2678571
## 2015 0.2737439 0.2304965
# Convert prop_female to a data frame
prop_female_df <- as.data.frame(prop_female)
# Add a new column Year
prop_female_df$Year <- row.names(prop_female_df)
# Call gather on prop_female_df
prop_female_long <- gather(prop_female_df, Region, Prop, -Year)
# Create a line plot
ggplot(prop_female_long, aes(x = Year, y = Prop, group = Region, color = Region)) +
geom_line()
# Call summary on mort
summary(mort)
## min max mean NAs
## enterprise 1.0000000 2.0000000 1.3814571 0.0000000
## record_number 0.0000000 999.0000000 499.9080571 0.0000000
## msa 0.0000000 1.0000000 0.8943571 0.0000000
## perc_minority 1.0000000 9.0000000 1.9701857 0.0000000
## tract_income_ratio 1.0000000 9.0000000 2.3431571 0.0000000
## borrower_income_ratio 1.0000000 9.0000000 2.6898857 0.0000000
## loan_purpose 1.0000000 9.0000000 3.7670143 0.0000000
## federal_guarantee 1.0000000 4.0000000 3.9840857 0.0000000
## borrower_race 1.0000000 9.0000000 5.3572429 0.0000000
## co_borrower_race 1.0000000 9.0000000 7.0002714 0.0000000
## borrower_gender 1.0000000 9.0000000 1.4590714 0.0000000
## co_borrower_gender 1.0000000 9.0000000 3.0494857 0.0000000
## num_units 1.0000000 4.0000000 1.0398143 0.0000000
## affordability 0.0000000 9.0000000 4.2863429 0.0000000
## year 2008.0000000 2015.0000000 2011.2714714 0.0000000
## type 0.0000000 1.0000000 0.5300429 0.0000000
bir_df_wide <- bigtable(mort, c("borrower_income_ratio", "year")) %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%
filter(rowname %in% c(1, 2, 3)) %>%
select(-rowname) %>%
# Create a new column called BIR with the corresponding table categories
mutate(BIR = c(">=0,<=50%", ">50, <=80%", ">80%"))
bir_df_wide
## 2008 2009 2010 2011 2012 2013 2014 2015 BIR
## 1 1205 1473 600 620 745 725 401 380 >=0,<=50%
## 2 2095 2791 1554 1421 1819 1861 1032 1145 >50, <=80%
## 3 4844 6707 6609 5934 8338 7559 4255 5169 >80%
bir_df_wide %>%
# Transform the wide-formatted data.frame into the long format
gather(Year, Count, -BIR) %>%
# Use ggplot to create a line plot
ggplot(aes(x = Year, y = Count, group = BIR, color = BIR)) +
geom_line()
Chapter 3 - Working with iotools
Introduction to chunk-wise processing - solution to challenges from bigmemory:
First look at iotools: Importing data:
Using chunk.apply - effectively moves away from what is functionally a “for loop” to allow better parallel processing:
Example code includes:
# A "foldable" range: works on a plain vector, or on a list of partial
# c(min, max) results produced by earlier calls (so it can be used both
# inside a Map() step and as the final Reduce-style combine).
#
# Returns c(min, max) of the input.
foldable_range <- function(x) {
  if (!is.list(x)) {
    # Plain vector: compute its range directly
    return(range(x))
  }
  # List of partial results: fold min and max across all elements
  c(Reduce(min, x), Reduce(max, x))
}
# Verify that foldable_range() works on the record_number column
foldable_range(mort[, "record_number"])
## [1] 0 999
# Split the mortgage data by year
spl <- split(1:nrow(mort), mort[, "year"])
# Use foldable_range() to get the range of the record numbers
foldable_range(Map(function(s) foldable_range(mort[s, "record_number"]), spl))
## [1] 0 999
# Load the iotools and microbenchmark packages
library(iotools)
library(microbenchmark)
# Time the reading of files
microbenchmark(
# Time the reading of a file using read.delim five times
read.delim("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ","),
# Time the reading of a file using read.delim.raw five times
read.delim.raw("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ","),
times = 5
)
## Unit: milliseconds
## expr
## read.delim("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ",")
## read.delim.raw("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ",")
## min lq mean median uq max neval cld
## 261.33058 279.79356 328.89131 292.14537 405.4698 405.7173 5 b
## 62.72106 72.04946 78.82342 72.12565 76.6886 110.5323 5 a
# Read mortgage-sample.csv as a raw vector
raw_file_content <- readAsRaw("./RInputFiles/mortgage-sample.csv")
# Convert the raw vector contents to a matrix
mort_mat <- mstrsplit(raw_file_content, sep = ",", type = "integer", skip = 1)
# Look at the first 6 rows
head(mort_mat)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 566 1 1 3 1 2 4 3 9 2 4 1
## [2,] 1 116 1 3 2 1 2 4 5 9 1 4 1
## [3,] 1 239 1 2 2 3 8 4 5 5 1 2 1
## [4,] 1 62 1 2 3 3 2 4 5 9 2 4 1
## [5,] 1 106 1 2 3 3 2 4 9 9 3 4 1
## [6,] 1 759 1 3 3 2 2 4 9 9 1 2 2
## [,14] [,15] [,16]
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
## [4,] 4 2009 1
## [5,] 4 2013 1
## [6,] 4 2010 1
# Convert the raw file contents to a data.frame
mort_df <- dstrsplit(raw_file_content, sep = ",", col_types = rep("integer", 16), skip = 1)
# Look at the first 6 rows
head(mort_df)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 1 566 1 1 3 1 2 4 3 9 2 4 1 3 2010 1
## 2 1 116 1 3 2 1 2 4 5 9 1 4 1 3 2008 1
## 3 1 239 1 2 2 3 8 4 5 5 1 2 1 4 2014 0
## 4 1 62 1 2 3 3 2 4 5 9 2 4 1 4 2009 1
## 5 1 106 1 2 3 3 2 4 9 9 3 4 1 4 2013 1
## 6 1 759 1 3 3 2 2 4 9 9 1 2 2 4 2010 1
# We have created a file connection fc to the "mortgage-sample.csv" file and read in the first line to get rid of the header.
# Define the function to apply to each chunk
# Chunk handler for chunk.apply(): parse a raw chunk of CSV bytes into an
# integer matrix and tabulate column 3 (msa, per the file's header row).
make_table <- function(chunk) {
  # Parse the raw bytes of this chunk into an integer matrix
  parsed <- mstrsplit(chunk, type = "integer", sep = ",")
  # Tally the values appearing in the third column of this chunk
  table(parsed[, 3])
}
# Create a file connection to mortgage-sample.csv
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
# Read the first line to get rid of the header
(col_names <- readLines(fc, n = 1))
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
(col_names <- lapply(str_split(col_names, '\\",\\"'), FUN=function(x) { str_replace(x, '\\"', '') })[[1]])
## [1] "enterprise" "record_number"
## [3] "msa" "perc_minority"
## [5] "tract_income_ratio" "borrower_income_ratio"
## [7] "loan_purpose" "federal_guarantee"
## [9] "borrower_race" "co_borrower_race"
## [11] "borrower_gender" "co_borrower_gender"
## [13] "num_units" "affordability"
## [15] "year" "type"
# Read the data in chunks
counts <- chunk.apply(fc, make_table, CH.MAX.SIZE = 1e5)
# Close the file connection
close(fc)
# Print counts
counts
## 0 1
## [1,] 309 2401
## [2,] 289 2422
## [3,] 266 2444
## [4,] 300 2410
## [5,] 279 2431
## [6,] 310 2400
## [7,] 274 2436
## [8,] 283 2428
## [9,] 259 2452
## [10,] 287 2423
## [11,] 288 2423
## [12,] 283 2428
## [13,] 271 2439
## [14,] 299 2411
## [15,] 294 2416
## [16,] 305 2405
## [17,] 280 2431
## [18,] 275 2435
## [19,] 303 2407
## [20,] 279 2431
## [21,] 296 2414
## [22,] 294 2417
## [23,] 288 2424
## [24,] 264 2446
## [25,] 292 2418
## [26,] 228 2013
# Sum up the chunks
colSums(counts)
## 0 1
## 7395 62605
msa_map <- c("rural", "urban")
# Define the function to apply to each chunk
# Chunk handler for chunk.apply(): parse a chunk of CSV bytes into a data
# frame, translate the msa codes to readable labels, and tabulate them.
# NOTE(review): relies on globals defined earlier in the script —
# `col_names` (header names) and `msa_map` (c("rural", "urban")).
make_msa_table <- function(chunk) {
  df <- dstrsplit(chunk, col_types = rep("integer", length(col_names)), sep = ",")
  colnames(df) <- col_names
  # msa is coded 0 (rural) / 1 (urban); shift by one to index into msa_map
  df$msa_pretty <- msa_map[df$msa + 1]
  # Count borrowers by residence label within this chunk
  table(df$msa_pretty)
}
# Create a file connection to mortgage-sample.csv
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
# Read the first line to get rid of the header
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Read the data in chunks
counts <- chunk.apply(fc, make_msa_table, CH.MAX.SIZE = 1e5)
# Close the file connection
close(fc)
# Aggregate the counts as before
colSums(counts)
## rural urban
## 7395 62605
# Read mortgage-sample.csv in chunks and tabulate msa labels, using
# `parallel` worker processes for the chunk processing. Written to
# benchmark serial vs. parallel chunk.apply().
#
# parallel: number of parallel processes passed through to chunk.apply()
#
# Returns the per-chunk count matrix produced by chunk.apply() (the
# original version returned the result of close(fc) and leaked the
# connection if chunk.apply() errored).
iotools_read_fun <- function(parallel) {
  fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
  # Guarantee the connection is closed even if chunk.apply() errors
  on.exit(close(fc), add = TRUE)
  # Discard the header line so chunks contain only data rows
  readLines(fc, n = 1)
  chunk.apply(fc, make_msa_table,
              CH.MAX.SIZE = 1e5, parallel = parallel)
}
# Benchmark the new function
microbenchmark(
# Use one process
iotools_read_fun(1),
# Use three processes
iotools_read_fun(3),
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max
## iotools_read_fun(1) 103.3186 107.7917 116.5193 113.8281 120.0514 159.1224
## iotools_read_fun(3) 102.9925 104.2279 114.2783 111.4085 122.0897 144.9311
## neval cld
## 20 a
## 20 a
Chapter 4 - Case Study: Preliminary Analysis of Housing Data
Overview of types of analysis for this chapter:
Are the data missing at random?
Analyzing the Housing Data:
Borrower Lending Trends: City vs. Rural:
Wrap up:
Example code includes:
# Create a table of borrower_race column
race_table <- bigtable(mort, "borrower_race")
# Rename the elements
names(race_table) <- race_cat[as.numeric(names(race_table))]
# Find the proportion
race_table[1:7] / sum(race_table[1:7])
## Native Am Asian Black Pacific Is White Two or More
## 0.002330129 0.072315464 0.032915105 0.003177448 0.814828092 0.008603552
## Hispanic
## 0.065830210
mort_names <- col_names
# Create table of the borrower_race
race_table_chunks <- chunk.apply(
"./RInputFiles/mortgage-sample.csv", function(chunk) {
x <- mstrsplit(chunk, sep = ",", type = "integer")
colnames(x) <- mort_names
table(x[, "borrower_race"])
}, CH.MAX.SIZE = 1e5)
# Add up the columns
race_table <- colSums(race_table_chunks)
# Find the proportion
borrower_proportion <- race_table[1:7] / sum(race_table[1:7])
pop_proportion <- c(0.009, 0.048, 0.126, 0.002, 0.724, 0.029, 0.163)
names(pop_proportion) <- race_cat[1:7]
# Create the matrix
matrix(c(pop_proportion, borrower_proportion), byrow = TRUE, nrow = 2,
dimnames = list(c("Population Proportion", "Borrower Proportion"), race_cat[1:7]))
## Native Am Asian Black Pacific Is
## Population Proportion 0.009000000 0.04800000 0.12600000 0.002000000
## Borrower Proportion 0.002330129 0.07231546 0.03291511 0.003177448
## White Two or More Hispanic
## Population Proportion 0.7240000 0.029000000 0.16300000
## Borrower Proportion 0.8148281 0.008603552 0.06583021
# Create a variable indicating if borrower_race is missing in the mortgage data
borrower_race_ind <- mort[, "borrower_race"] == 9
# Create a factor variable indicating the affordability
affordability_factor <- factor(mort[, "affordability"])
# Perform a logistic regression
summary(glm(borrower_race_ind ~ affordability_factor, family = binomial))
##
## Call:
## glm(formula = borrower_race_ind ~ affordability_factor, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5969 -0.5016 -0.5016 -0.5016 2.0867
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7478 0.1376 -12.701 <2e-16 ***
## affordability_factor1 -0.2241 0.1536 -1.459 0.1447
## affordability_factor2 -0.3090 0.1609 -1.920 0.0548 .
## affordability_factor3 -0.2094 0.1446 -1.448 0.1476
## affordability_factor4 -0.2619 0.1383 -1.894 0.0582 .
## affordability_factor9 0.1131 0.1413 0.800 0.4235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 52279 on 69999 degrees of freedom
## Residual deviance: 52166 on 69994 degrees of freedom
## AIC: 52178
##
## Number of Fisher Scoring iterations: 4
# Open a connection to the file and skip the header
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Create a function to read chunks
# Chunk handler: parse a raw CSV chunk into a named integer matrix and
# cross-tabulate borrower_race against year via bigtabulate's bigtable().
# NOTE(review): relies on the global `mort_names` for the column labels.
make_table <- function(chunk) {
  mat <- mstrsplit(chunk, sep = ",", type = "integer")
  # Attach column names so bigtable() can select by name
  colnames(mat) <- mort_names
  bigtable(mat, c("borrower_race", "year"))
}
# Import data using chunk.apply
race_year_table <- chunk.apply(fc, make_table)
## Warning: closing unused connection 5 (./RInputFiles/mortgage-sample.csv)
# Close connection
close(fc)
# Cast it to a data frame
rydf <- as.data.frame(race_year_table)
# Create a new column Race with race/ethnicity
rydf$Race <- race_cat
# Note: We removed the row corresponding to "Not Avail".
# View rydf
rydf <-
rydf %>%
filter(Race !="Not Avail")
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
# View pop_proportion
pop_proportion
## Native Am Asian Black Pacific Is White Two or More
## 0.009 0.048 0.126 0.002 0.724 0.029
## Hispanic
## 0.163
# Gather on all variables except Race
rydfl <- gather(rydf, Year, Count, -Race)
# Create a new adjusted count variable
rydfl$Adjusted_Count <- rydfl$Count / pop_proportion[rydfl$Race]
# Plot
ggplot(rydfl, aes(x = Year, y = Adjusted_Count, group = Race, color = Race)) +
geom_line()
# View rydf
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
# Normalize the columns
for (i in seq_len(nrow(rydf))) {
rydf[i, 1:8] <- rydf[i, 1:8] / rydf[i, 1]
}
# Convert the data to long format
rydf_long <- gather(rydf, Year, Proportion, -Race)
# Plot
ggplot(rydf_long, aes(x = Year, y = Proportion, group = Race, color = Race)) +
geom_line()
# Open a connection to the file and skip the header
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Create a function to read chunks
# Chunk handler: parse a raw CSV chunk into a named integer matrix and
# cross-tabulate msa (urban/rural) against year via bigtable().
# NOTE(review): relies on the global `mort_names` for the column labels.
make_table <- function(chunk) {
  mat <- mstrsplit(chunk, sep = ",", type = "integer")
  # Attach column names so bigtable() can select by name
  colnames(mat) <- mort_names
  bigtable(mat, c("msa", "year"))
}
# Import data using chunk.apply
msa_year_table <- chunk.apply(fc, make_table)
# Close connection
close(fc)
# Convert to a data frame
df_msa <- as.data.frame(msa_year_table)
# Add an MSA column labeling the rows
df_msa$MSA <- c("rural", "city")
# Gather on all columns except Year
df_msa_long <- gather(df_msa, Year, Count, -MSA)
# Plot
ggplot(df_msa_long, aes(x = Year, y = Count, group = MSA, color = MSA)) +
geom_line()
# Tabulate borrower_income_ratio and federal_guarantee
ir_by_fg <- bigtable(mort, c("borrower_income_ratio", "federal_guarantee"))
# Label the columns and rows of the table
income_cat <- c('0 <= 50', '50 < 80', '> 80', 'Not Applicable')
guarantee_cat <- c('FHA/VA', 'RHS', 'HECM', 'No Guarantee')
dimnames(ir_by_fg) <- list(income_cat, guarantee_cat)
# For each row in ir_by_fg, divide by the sum of the row
for (i in seq_len(nrow(ir_by_fg))) {
ir_by_fg[i, ] = ir_by_fg[i, ] / sum(ir_by_fg[i, ])
}
# Print
ir_by_fg
## FHA/VA RHS HECM No Guarantee
## 0 <= 50 0.008944544 0.0014636526 0.0443974630 0.9451943
## 50 < 80 0.005977548 0.0024055985 0.0026971862 0.9889197
## > 80 0.001113022 0.0002428412 0.0006475766 0.9979966
## Not Applicable 0.023676880 0.0013927577 0.0487465181 0.9261838
# Quirky fix so that the files can be used again later
rm(mort)
rm(x)
rm(first_three)
rm(first_three_2)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1885818 100.8 3205452 171.2 3205452 171.2
## Vcells 12911266 98.6 20742801 158.3 17951103 137.0
Chapter 1 - Downloading Files and Using API Clients
Introduction: Working with Web Data in R:
Understanding Application Programming Interfaces (API) - automatically handling data changes:
Access tokens and API:
Example code includes:
# Here are the URLs! As you can see they're just normal strings
csv_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1561/datasets/chickwts.csv"
tsv_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_3026/datasets/tsv_data.tsv"
# Read a file in from the CSV URL and assign it to csv_data
csv_data <- read.csv(csv_url)
# Read a file in from the TSV URL and assign it to tsv_data
tsv_data <- read.delim(tsv_url)
# Examine the objects with head()
head(csv_data)
## weight feed
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
head(tsv_data)
## weight feed
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
# Download the file with download.file()
download.file(url = csv_url, destfile = "./RInputFiles/feed_data.csv")
# Read it in with read.csv()
csv_data <- read.csv("./RInputFiles/feed_data.csv")
# Add a new column: square_weight
csv_data$square_weight <- csv_data$weight ** 2
# Save it to disk with saveRDS()
saveRDS(csv_data, "./RInputFiles/modified_feed_data.RDS")
# Read it back in with readRDS()
modified_feed_data <- readRDS("./RInputFiles/modified_feed_data.RDS")
# Examine modified_feed_data
str(modified_feed_data)
## 'data.frame': 71 obs. of 3 variables:
## $ weight : int 179 160 136 227 217 168 108 124 143 140 ...
## $ feed : Factor w/ 6 levels "casein","horsebean",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ square_weight: num 32041 25600 18496 51529 47089 ...
# Load pageviews
# library(pageviews)
# Get the pageviews for "Hadley Wickham"
hadley_pageviews <- pageviews::article_pageviews(project = "en.wikipedia", "Hadley Wickham")
# Examine the resulting object
str(hadley_pageviews)
## 'data.frame': 1 obs. of 8 variables:
## $ project : chr "wikipedia"
## $ language : chr "en"
## $ article : chr "Hadley_Wickham"
## $ access : chr "all-access"
## $ agent : chr "all-agents"
## $ granularity: chr "daily"
## $ date : POSIXct, format: "2015-10-01"
## $ views : num 53
# Load birdnik
# library(birdnik)
# Get the word frequency for "vector", using api_key to access it
# vector_frequency <- word_frequency(api_key, "vector")
Chapter 2 - Using httr to interact with API Directly
GET and POST requests in theory - https and web requests in theory:
Graceful httr - code that responds appropriately and constructs its own url:
Respectful API Usage - usage that works for the API owners as well as the clients:
Example code includes:
# Load the httr package
library(httr)
# Make a GET request to http://httpbin.org/get
get_result <- GET("http://httpbin.org/get")
# Print it to inspect it
# get_result
# Make a POST request to http://httpbin.org/post with the body "this is a test"
# post_result <- POST(url="http://httpbin.org/post", body="this is a test")
# Print it to inspect it
# post_result
url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/Hadley_Wickham/daily/20170101/20170102"
# Make a GET request to url and save the results
pageview_response <- GET(url)
# Call content() to retrieve the data the server sent back
pageview_data <- content(pageview_response)
# Examine the results with str()
str(pageview_data)
## List of 1
## $ items:List of 2
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010100"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 45
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010200"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 86
fake_url <- "http://google.com/fakepagethatdoesnotexist"
# Make the GET request
request_result <- GET(fake_url)
# Check request_result
if(http_error(request_result)){
warning("The request failed")
} else {
content(request_result)
}
## Warning: The request failed
# Construct a directory-based API URL to `http://swapi.co/api`,
# looking for person `1` in `people`
directory_url <- paste("http://swapi.co/api", "people", 1, sep = "/")
# Make a GET call with it
result <- GET(directory_url)
# Create list with nationality and country elements
query_params <- list(nationality = "americans",
country = "antigua")
# Make parameter-based call to httpbin, with query_params
parameter_response <- GET("https://httpbin.org/get", query = query_params)
# Print parameter_response
parameter_response
## Response [https://httpbin.org/get?nationality=americans&country=antigua]
## Date: 2018-02-16 13:02
## Status: 200
## Content-Type: application/json
## Size: 425 B
## {
## "args": {
## "country": "antigua",
## "nationality": "americans"
## },
## "headers": {
## "Accept": "application/json, text/xml, application/xml, */*",
## "Accept-Encoding": "gzip, deflate",
## "Connection": "close",
## "Host": "httpbin.org",
## ...
# Do not change the url
# url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Aaron_Halfaker/daily/2015100100/2015103100"
# Add the email address and the test sentence inside user_agent()
# server_response <- GET(url, user_agent("my@email.address this is a test"))
# Construct a vector of 2 URLs
urls <- c("http://fakeurl.com/api/1.0/", "http://fakeurl.com/api/2.0/")
for(url in urls){
# Send a GET request to url
result <- GET(url)
# Delay for 1 second between requests so we don't hammer the API server
Sys.sleep(1)
}
get_pageviews <- function(article_title){
# Fetch daily pageviews (Oct 2015) for one Wikipedia article from the
# Wikimedia REST API, identifying ourselves with a user_agent() string.
#
# Bug fix: the original called paste0(..., sep = "/"). paste0() has no
# `sep` argument, so "/" was swallowed into `...` and concatenated as
# data — the path components ran together with no separators, producing
# a malformed URL. paste(..., sep = "/") builds the intended URL.
url <- paste("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents", article_title, "daily/2015100100/2015103100", sep = "/")
response <- GET(url, user_agent("my@email.com this is a test"))
if(http_error(response)){
stop("the request failed" )
} else {
# Parsed response body (a list) is returned to the caller
result <- content(response)
return(result)
}
}
Chapter 3 - Handling JSON and XML
JSON is a dictionary-like format (plain text) for sending data over the internet:
Manipulating JSON - lists are the natural R hierarchy for JSON:
XML Structure - plain text like JSON, but with a very different structure:
XPATH - language for specifying nodes in an XML document:
Example code includes:
rev_history <- function(title, format = "json"){
# Return a canned API response (stored locally as an RDS file) holding
# the revision history of the "Hadley Wickham" Wikipedia page, in either
# JSON or XML form. Only that one title is supported.
if (title != "Hadley Wickham") {
stop('rev_history() only works for `title = "Hadley Wickham"`')
}
# Dispatch on the requested format; anything else is rejected.
switch(format,
json = readRDS("had_rev_json.rds"),
xml = readRDS("had_rev_xml.rds"),
stop('Invalid format supplied, try "json" or "xml"')
)
}
test_json <- "{\"continue\":{\"rvcontinue\":\"20150528042700|664370232\",\"continue\":\"||\"},\"query\":{\"pages\":{\"41916270\":{\"pageid\":41916270,\"ns\":0,\"title\":\"Hadley Wickham\",\"revisions\":[{\"user\":\"214.28.226.251\",\"anon\":\"\",\"timestamp\":\"2015-01-14T17:12:45Z\",\"comment\":\"\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Mary Helen Wickham III''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"73.183.151.193\",\"anon\":\"\",\"timestamp\":\"2015-01-15T15:49:34Z\",\"comment\":\"\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"FeanorStar7\",\"timestamp\":\"2015-01-24T16:34:31Z\",\"comment\":\"/* External links */ add LCCN and cats\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"KasparBot\",\"timestamp\":\"2015-04-26T19:18:17Z\",\"comment\":\"authority control moved to wikidata\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"Spkal\",\"timestamp\":\"2015-05-06T18:24:57Z\",\"comment\":\"/* Bibliography */ Added his new book, R Packages\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! |publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"}]}}}}"
# Get revision history for "Hadley Wickham"
resp_json <- rev_history("Hadley Wickham")
# Check http_type() of resp_json
http_type(resp_json)
# Examine returned text with content()
content(resp_json, as="text")
# Parse response with content()
content(resp_json, as="parsed")
# Parse returned text with fromJSON()
library(jsonlite)
fromJSON(content(resp_json, as="text"))
# Load rlist
library(rlist)
# Examine output of this code
str(content(resp_json), max.level = 4)
# Store revision list
revs <- content(resp_json)$query$pages$`41916270`$revisions
# Extract the user element
user_time <- list.select(revs, user, timestamp)
# Print user_time
user_time
# Stack to turn into a data frame
list.stack(user_time)
# Load dplyr
library(dplyr)
# Pull out revision list
revs <- content(resp_json)$query$pages$`41916270`$revisions
# Extract user and timestamp
revs %>%
bind_rows() %>%
select(user, timestamp)
# Load xml2
library(xml2)
# Get XML revision history
resp_xml <- rev_history("Hadley Wickham", format = "xml")
# Check response is XML
http_type(resp_xml)
# Examine returned text with content()
rev_text <- content(resp_xml, as="text")
rev_text
# Turn rev_text into an XML document
rev_xml <- read_xml(rev_text)
# Examine the structure of rev_xml
str(rev_xml)
# Load xml2
library(xml2)
# Get XML revision history
resp_xml <- rev_history("Hadley Wickham", format = "xml")
# Check response is XML
http_type(resp_xml)
# Examine returned text with content()
rev_text <- content(resp_xml, as="text")
rev_text
# Turn rev_text into an XML document
rev_xml <- read_xml(rev_text)
# Examine the structure of rev_xml
xml_structure(rev_xml)
# Find all nodes using XPATH "/api/query/pages/page/revisions/rev"
xml_find_all(rev_xml, "/api/query/pages/page/revisions/rev")
# Find all rev nodes anywhere in document
rev_nodes <- xml_find_all(rev_xml, "//rev")
# Use xml_text() to get text from rev_nodes
xml_text(rev_nodes)
# All rev nodes
rev_nodes <- xml_find_all(rev_xml, "//rev")
# The first rev node
first_rev_node <- xml_find_first(rev_xml, "//rev")
# Find all attributes with xml_attrs()
xml_attrs(first_rev_node)
# Find user attribute with xml_attr()
xml_attr(first_rev_node, attr="user")
# Find user attribute for all rev nodes
xml_attr(rev_nodes, attr="user")
# Find anon attribute for all rev nodes
xml_attr(rev_nodes, attr="anon")
get_revision_history <- function(article_title){
# Fetch the (canned) XML revision history for `article_title` and flatten
# it into a data frame with one row per revision: the editing user, the
# parsed timestamp, and the first 40 characters of the revision content.
rev_resp <- rev_history(article_title, format = "xml")
# content() returns the raw XML text; read_xml() parses it into a document
rev_xml <- read_xml(content(rev_resp, "text"))
# Every <rev> node anywhere in the document is one revision
rev_nodes <- xml_find_all(rev_xml, "//rev")
# Assemble the result directly, avoiding a local named `content` that
# would shadow httr::content()
data.frame(
user = xml_attr(rev_nodes, attr = "user"),
timestamp = readr::parse_datetime(xml_attr(rev_nodes, "timestamp")),
content = substr(xml_text(rev_nodes), 1, 40)
)
}
# Call function for "Hadley Wickham"
get_revision_history(article_title = "Hadley Wickham")
Chapter 4 - Web Scraping with XPATH
Web scraping 101 - sometimes a website does not have an API, so a different approach is required:
HTML structure - basically, content within tags, much like XML:
This is a test
requests that “This is a test” be available in paragraph form. Reformatting data (especially to a rectangular format such as a data frame):
Example code includes:
# Load rvest
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
# Print test_xml
test_xml
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
test_node_xpath <- "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"vcard\", \" \" ))]"
# Use html_node() to grab the node with the XPATH stored as `test_node_xpath`
node <- html_node(x = test_xml, xpath = test_node_xpath)
# Print the first element of the result
node[1]
## $node
## <pointer: 0x000000000b91bb80>
# The first thing we'll grab is a name, from the first element of the previously extracted table (now stored as table_element)
table_element <- node
# Extract the name of table_element
element_name <- html_name(table_element)
# Print the name
element_name
## [1] "table"
second_xpath_val <- "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"fn\", \" \" ))]"
# Extract the element of table_element referred to by second_xpath_val and store it as page_name
page_name <- html_node(x = table_element, xpath = second_xpath_val)
# Extract the text from page_name
page_title <- html_text(page_name)
# Print page_title
page_title
## [1] "Hadley Wickham"
# Turn table_element into a data frame and assign it to wiki_table
wiki_table <- html_table(table_element)
# Print wiki_table
wiki_table
## Hadley Wickham
## 1
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## 11
## Hadley Wickham
## 1
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
## 11
# Rename the columns of wiki_table
colnames(wiki_table) <- c("key", "value")
# Remove the empty row from wiki_table
cleaned_table <- subset(wiki_table, !(key == ""))
# Print cleaned_table
cleaned_table
## key
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
Chapter 5 - CSS Web Scraping and Final Case Study
CSS (cascading style sheets) web scraping in theory:
Final case study: Introduction:
Wrap up:
Example code includes:
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
# Print test_xml
test_xml
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
# Select the table elements
html_nodes(test_xml, css = "table")
## {xml_nodeset (2)}
## [1] <table class="infobox biography vcard" style="width:22em">\n<tr>\n<t ...
## [2] <table class="nowraplinks hlist navbox-inner" style="border-spacing: ...
# Select elements with class = "infobox"
html_nodes(test_xml, css = ".infobox")
## {xml_nodeset (1)}
## [1] <table class="infobox biography vcard" style="width:22em">\n<tr>\n<t ...
# Select elements with id = "firstHeading"
html_nodes(test_xml, css = "#firstHeading")
## {xml_nodeset (1)}
## [1] <h1 id="firstHeading" class="firstHeading" lang="en">Hadley Wickham< ...
# Extract element with class infobox
infobox_element <- html_nodes(test_xml, css = ".infobox")
# Get tag name of infobox_element
element_name <- html_name(infobox_element)
# Print element_name
element_name
## [1] "table"
# Extract element with class fn
page_name <- html_node(x = infobox_element, css=".fn")
# Get contents of page_name
page_title <- html_text(page_name)
# Print page_title
page_title
## [1] "Hadley Wickham"
# Load httr
library(httr)
# The API url
base_url <- "https://en.wikipedia.org/w/api.php"
# Set query parameters for the MediaWiki "parse" action
query_params <- list(action="parse",
page="Hadley Wickham",
format="xml")
# Get data from API. Use base_url here rather than repeating the literal
# URL, so the endpoint has a single source of truth.
resp <- GET(url = base_url, query = query_params)
# Parse response body (XML envelope around the rendered page HTML)
resp_xml <- content(resp)
# Load rvest
library(rvest)
# Read page contents as HTML
page_html <- read_html(xml_text(resp_xml))
# Extract infobox element
infobox_element <- html_node(page_html, css=".infobox")
# Extract page name element from infobox
page_name <- html_node(infobox_element, css=".fn")
# Extract page name as text
page_title <- html_text(page_name)
# Your code from earlier exercises
wiki_table <- html_table(infobox_element)
colnames(wiki_table) <- c("key", "value")
cleaned_table <- subset(wiki_table, !key == "")
# Create a dataframe for full name
name_df <- data.frame(key = "Full name", value = page_title)
# Combine name_df with cleaned_table
wiki_table2 <- rbind(name_df, cleaned_table)
# Print wiki_table
wiki_table2
## key
## 1 Full name
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 1 Hadley Wickham
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
library(httr)
library(rvest)
library(xml2)
get_infobox <- function(title){
# Scrape the infobox of a Wikipedia page via the MediaWiki parse API and
# return it as a two-column (key/value) data frame, with the page's full
# name prepended as the first row.
base_url <- "https://en.wikipedia.org/w/api.php"
query_params <- list(action = "parse",
page = title,
format = "xml")
resp <- GET(url = base_url, query = query_params)
# Fail fast with a clear message instead of an obscure downstream parse error
if (http_error(resp)) {
stop("The request to the Wikipedia API failed", call. = FALSE)
}
resp_xml <- content(resp)
# The API wraps the rendered page HTML inside an XML envelope; unwrap it
page_html <- read_html(xml_text(resp_xml))
infobox_element <- html_node(x = page_html, css = ".infobox")
page_name <- html_node(x = infobox_element, css = ".fn")
page_title <- html_text(page_name)
wiki_table <- html_table(infobox_element)
colnames(wiki_table) <- c("key", "value")
# Drop empty spacer rows; subset() evaluates `key` inside wiki_table, so
# `key != ""` is clearer than the original `!wiki_table$key == ""`
cleaned_table <- subset(wiki_table, key != "")
name_df <- data.frame(key = "Full name", value = page_title)
rbind(name_df, cleaned_table)
}
# Test get_infobox with "Hadley Wickham"
get_infobox(title = "Hadley Wickham")
## key
## 1 Full name
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 1 Hadley Wickham
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
# Try get_infobox with "Ross Ihaka"
get_infobox(title = "Ross Ihaka")
## key
## 1 Full name
## 2 Ihaka at the 2010 New Zealand Open Source Awards
## 3 Residence
## 4 Alma mater
## 5 Known for
## 6 Awards
## 7 Scientific career
## 8 Fields
## 9 Institutions
## 10 Thesis
## 11 Doctoral advisor
## value
## 1 Ross Ihaka
## 2 Ihaka at the 2010 New Zealand Open Source Awards
## 3 Auckland, New Zealand
## 4 University of AucklandUniversity of California, Berkeley
## 5 R programming language
## 6 Pickering Medal (2008)
## 7 Scientific career
## 8 Statistical Computing
## 9 University of Auckland
## 10 Ruaumoko (1985)
## 11 David R. Brillinger
# Try get_infobox with "Grace Hopper"
get_infobox(title = "Grace Hopper")
## key
## 1 Full name
## 2 Rear Admiral Grace M. Hopper, 1984
## 3 Nickname(s)
## 4 Born
## 5 Died
## 6 Place of burial
## 7 Allegiance
## 8 Service/branch
## 9 Years of service
## 10 Rank
## 11 Awards
## value
## 1 Grace Murray Hopper
## 2 Rear Admiral Grace M. Hopper, 1984
## 3 "Amazing Grace"
## 4 (1906-12-09)December 9, 1906\nNew York City, New York, U.S.
## 5 January 1, 1992(1992-01-01) (aged 85)Arlington, Virginia, U.S.
## 6 Arlington National Cemetery
## 7 United States of America
## 8 United States Navy
## 9 1943–1966, 1967–1971, 1972–1986
## 10 Rear admiral (lower half)
## 11 Defense Distinguished Service Medal Legion of Merit Meritorious Service Medal American Campaign Medal World War II Victory Medal National Defense Service Medal Armed Forces Reserve Medal with two Hourglass Devices Naval Reserve Medal Presidential Medal of Freedom (posthumous)
Chapter 1 - Basic plotting with lattice
Introduction - general objectives:
Optional arguments:
Box and whisker plots and reordering elements:
Example code includes:
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Load the lattice package
library(lattice)
# Create the histogram
histogram(~ Ozone, data = airquality)
# Create the histogram
histogram(~ Ozone, data = airquality,
# Specify number of bins
nint = 15,
# Specify quantity displayed on y-axis
type = "count")
# Create the scatter plot
xyplot(Ozone ~ Solar.R, data = airquality)
# Create scatterplot
xyplot(Ozone ~ Temp, data = airquality,
# Add main label
main = "Environmental conditions in New York City (1973)",
# Add axis labels
ylab = "Ozone (ppb)",
xlab = "Temperature (Fahrenheit)")
# Create a density plot
densityplot(~ Ozone, data = airquality,
# Choose how raw data is shown
plot.points = "jitter")
data(USCancerRates, package="latticeExtra")
str(USCancerRates)
## 'data.frame': 3041 obs. of 8 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female: num 124 103 161 157 151 ...
## $ UCL95.female: num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
rn_USCR <- row.names(USCancerRates)
# Create reordered variable
library(dplyr)
USCancerRates <-
mutate(USCancerRates,
state.ordered = reorder(state, rate.female, median, na.rm = TRUE)
)
# Create box and whisker plot
bwplot(state.ordered ~ rate.female, data = USCancerRates)
# Create box and whisker plot
bwplot(state.ordered ~ rate.female, data = USCancerRates,
# Change whiskers extent
coef = 0)
Chapter 2 - Conditioning and the Formula Interface
Conditioning - identify sources of variability in the data by examining sub-groups:
Data summary and transformation - grouping:
Incorporating external data sources:
The trellis object - lattice creates trellis objects rather than directly creating plots (as in base R):
Example code includes:
# The airquality dataset has been pre-loaded
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Create a histogram
histogram(~ Ozone | factor(Month),
data = airquality,
# Define the layout
layout=c(2, 3),
# Change the x-axis label
xlab="Ozone (ppb)")
# USCancerRates has been pre-loaded
str(USCancerRates)
## 'data.frame': 3041 obs. of 9 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered: Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
# Create a density plot
densityplot(~ rate.male + rate.female,
data = USCancerRates,
outer = TRUE,
# Suppress data points
plot.points = FALSE,
# Add a reference line
ref=TRUE)
# Create a density plot
densityplot(~ rate.male + rate.female,
data = USCancerRates,
# Set value of 'outer'
outer=FALSE,
# Add x-axis label
xlab="Rate (per 100,000)",
# Add a legend
auto.key=TRUE,
plot.points = FALSE,
ref = TRUE)
xyplot(Ozone ~ Temp, airquality, groups = Month,
# Complete the legend spec
auto.key = list(space = "right",
title = "Month",
text = month.name[5:9]))
USCancerRates <- USCancerRates %>%
mutate(division=state.division[match(state, state.name)])
# Create 'division.ordered' by reordering levels
USCancerRates <-
mutate(USCancerRates,
division.ordered = reorder(division,
rate.male + rate.female,
mean, na.rm = TRUE))
# Create conditioned scatter plot
xyplot(rate.female ~ rate.male | division.ordered,
data = USCancerRates,
# Add reference grid
grid = TRUE,
# Add reference line
abline = c(0, 1))
# Levels of division.ordered
levels(USCancerRates$division.ordered)
## [1] "Mountain" "West North Central" "Pacific"
## [4] "Middle Atlantic" "New England" "East North Central"
## [7] "West South Central" "South Atlantic" "East South Central"
# Specify the as.table argument
xyplot(rate.female ~ rate.male | division.ordered,
data = USCancerRates,
grid = TRUE, abline = c(0, 1),
as.table=TRUE)
# Create box-and-whisker plot
bwplot(division.ordered ~ rate.male + rate.female,
data = USCancerRates,
outer = TRUE,
# Add a label for the x-axis
xlab="Rate (per 100,000)",
# Add strip labels
strip = strip.custom(factor.levels = c("Male", "Female")))
# Create "trellis" object
tplot <-
densityplot(~ rate.male + rate.female | division.ordered,
data = USCancerRates, outer = TRUE,
plot.points = FALSE, as.table = TRUE)
# Change names for the second dimension
dimnames(tplot)[[2]] <- c("Male", "Female")
# Update x-axis label and plot
update(tplot, xlab = "Rate")
# Create "trellis" object
tplot <-
densityplot(~ rate.male + rate.female | division.ordered,
data = USCancerRates, outer = TRUE,
plot.points = FALSE, as.table = TRUE)
# Inspect dimension
dim(tplot)
## [1] 9 2
dimnames(tplot)
## $division.ordered
## [1] "Mountain" "West North Central" "Pacific"
## [4] "Middle Atlantic" "New England" "East North Central"
## [7] "West South Central" "South Atlantic" "East South Central"
##
## [[2]]
## [1] "rate.male" "rate.female"
# Select subset retaining only last three divisions
tplot[7:9, ]
Chapter 3 - Controlling scales and graphical parameters
Combining scales:
Logarithmic scales:
Graphical parameters:
Using simpleTheme():
Example code includes:
# The lattice package and the USMortality dataset have been pre-loaded.
Status <- factor(c('Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural'), levels=c("Rural", "Urban")
)
Sex <- factor(c('Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female'), levels=c("Female", "Male")
)
Cause <- factor(c('Heart disease', 'Heart disease', 'Heart disease', 'Heart disease', 'Cancer', 'Cancer', 'Cancer', 'Cancer', 'Lower respiratory', 'Lower respiratory', 'Lower respiratory', 'Lower respiratory', 'Unintentional injuries', 'Unintentional injuries', 'Unintentional injuries', 'Unintentional injuries', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Alzheimers', 'Alzheimers', 'Alzheimers', 'Alzheimers', 'Diabetes', 'Diabetes', 'Diabetes', 'Diabetes', 'Flu and pneumonia', 'Flu and pneumonia', 'Flu and pneumonia', 'Flu and pneumonia', 'Suicide', 'Suicide', 'Suicide', 'Suicide', 'Nephritis', 'Nephritis', 'Nephritis', 'Nephritis'),
levels=c('Alzheimers', 'Cancer', 'Cerebrovascular diseases', 'Diabetes', 'Flu and pneumonia', 'Heart disease', 'Lower respiratory', 'Nephritis', 'Suicide', 'Unintentional injuries')
)
Rate <- c(210.2, 242.7, 132.5, 154.9, 195.9, 219.3, 140.2, 150.8, 44.5, 62.8, 36.5, 46.9, 49.6, 71.3, 24.7, 37.2, 36.1, 42.2, 34.9, 42.2, 19.4, 21.8, 25.5, 30.6, 24.9, 29.5, 17.1, 21.8, 17.7, 20.8, 12.9, 16.3, 19.2, 26.3, 5.3, 6.2, 15.7, 18.3, 10.7, 13.9)
SE <- c(0.2, 0.6, 0.2, 0.4, 0.2, 0.5, 0.2, 0.4, 0.1, 0.3, 0.1, 0.2, 0.1, 0.3, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.1, 0.1, 0.2, 0, 0.1, 0.1, 0.2, 0, 0.1)
USMortality <- data.frame(Status=Status, Sex=Sex, Cause=Cause, Rate=Rate, SE=SE)
# Specify upper bound to exclude Heart disease and Cancer
x_limits <- c(0, 100)
# Draw the plot
dotplot(Cause ~ Rate | Sex + Status, data = USMortality, as.table = TRUE,
xlim = x_limits)
dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = TRUE,
scales = list(x = list(relation = "free",
# Specify limits for each panel
limits = list(c(0, 50), c(0, 80),
c(0, 50), c(0, 80) ))))
dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = TRUE,
# Change the number of tick marks
scales = list(x = list(tick.number = 10,
# Show `Rate` labels on both bottom and top
alternating = 3,
# Rotate `Rate` labels by 90 degrees
rot = 90),
# Rotate `Cause` labels by 45 degrees
y = list(rot = 45)))
# Tick positions at powers of two: 2^3 (= 8) up to 2^8 (= 256).
# Note: use the documented exponentiation operator `^`; `**` is only an
# unadvertised synonym in R and should not appear in code.
x_ticks_at <- 2^(3:8)
dotplot(Cause ~ Rate | Sex, data = USMortality,
groups = Status, auto.key = list(columns = 2),
scales = list(x = list(log = 2,
# A numeric vector with
# values 2^3, 2^4, ..., 2^8
at = x_ticks_at,
# A character vector,
# "8" for 2^3, "16" for 2^4, etc.
labels = x_ticks_at)))
# Create the dot plot
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
scales = list(x = list(log = TRUE,
equispaced.log = FALSE)),
# Provide pch values for the two groups
pch = c(3, 1))
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
par.settings = simpleTheme(pch = c(3, 1)),
scales = list(x = list(log = 2, equispaced.log = FALSE)))
# The WorldPhones matrix is already provided, with the first row removed so you only need to consider consecutive years
data(WorldPhones)
WorldPhones <- WorldPhones[row.names(WorldPhones) != 1951, ]
WorldPhones
## N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
## 1956 60423 29990 4708 2568 2366 1411 733
## 1957 64721 32510 5230 2695 2526 1546 773
## 1958 68484 35218 6662 2845 2691 1663 836
## 1959 71799 37598 6856 3000 2868 1769 911
## 1960 76036 40341 8220 3145 3054 1905 1008
## 1961 79831 43173 9053 3338 3224 2005 1076
names(dimnames(WorldPhones)) <- c("Year", "Region")
# Transform matrix data to data frame
WorldPhonesDF <- as.data.frame(
# Intermediate step: convert to table
as.table(WorldPhones),
responseName = "Phones")
# Create the dot plot
dotplot(Year ~ Phones | Region,
data = WorldPhonesDF,
as.table = TRUE,
# Log-transform the x-axis
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
# Set x-axis relation to "sliced"
relation = "sliced")))
# Load latticeExtra package for ggplot2like()
library(latticeExtra)
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked _by_ '.GlobalEnv':
##
## USCancerRates
## The following object is masked from 'package:ggplot2':
##
## layer
# Transform matrix data to data frame
names(dimnames(WorldPhones)) <- c("Year", "Region")
WorldPhonesDF <-
as.data.frame(as.table(WorldPhones[-1, ]),
responseName = "Phones")
# Create the dot plot
dotplot(Year ~ Phones | Region,
data = WorldPhonesDF,
as.table = TRUE,
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
relation = "sliced")),
# Fill in suitable value of par.settings
par.settings = ggplot2like(),
# Fill in suitable value of lattice.options
lattice.options = ggplot2like.opts())
# Create factor variable
airquality$Month.Name <-
factor(airquality$Month, levels = 1:12,
labels = month.name[1:12])
# Create histogram of Ozone, conditioning on Month
histogram(~ Ozone | Month.Name,
data = airquality, as.table = TRUE,
# Set border to be transparent
border = "transparent",
# Set fill color to be mid-gray
col = "grey50")
# Create factor variable
airquality$Month.Name <-
factor(airquality$Month, levels = 1:12,
labels = month.name)
levels(airquality$Month.Name)
## [1] "January" "February" "March" "April" "May"
## [6] "June" "July" "August" "September" "October"
## [11] "November" "December"
# Drop empty levels
airquality$Month.Name <- droplevels(airquality$Month.Name)
levels(airquality$Month.Name)
## [1] "May" "June" "July" "August" "September"
# Obtain colors from RColorBrewer
library(RColorBrewer)
my.colors <- brewer.pal(n = 5, name = "Set1")
# Density plot of ozone concentration grouped by month
densityplot(~ Ozone, data = airquality, groups = Month.Name,
plot.points = FALSE,
auto.key = list(space = "right"),
# Fill in value of col
par.settings = simpleTheme(col = my.colors,
# Fill in value of lwd
lwd = 2))
Chapter 4 - Customizing plots using panel functions
Panel functions:
Prepanel Functions to control limits:
Optional arguments of default panel functions:
Example code includes:
# Panel function: standard scatter plot plus marginal "rugs" marking
# partially missing observations. A point whose y is NA cannot appear in
# the scatter, so its x-value is shown as a tick on the x-axis; likewise
# a point whose x is NA has its y-value shown as a tick on the y-axis.
panel.xyrug <- function(x, y, ...) {
  # Draw the usual scatter plot first
  panel.xyplot(x, y, ...)
  # Then add rug ticks for the coordinates whose partner value is NA
  panel.rug(x = x[is.na(y)], y = y[is.na(x)])
}
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
xyplot(Ozone ~ Solar.R | Month.Name, data = airquality,
panel = panel.xyrug, as.table = TRUE)
# Create factor variable with month names
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
# Create box-and-whisker plot
bwplot(Month.Name ~ Ozone + Temp, airquality,
# Specify outer
outer=TRUE,
# Specify x-axis relation
scales = list(x = list(relation="free")),
# Specify layout
layout=c(2, 1),
# Specify x-axis label
xlab="Measured value")
# Create violin plot
bwplot(Month.Name ~ Ozone + Temp, airquality,
# Specify outer
outer = TRUE,
# Specify x-axis relation
scales = list(x = list(relation="free")),
# Specify layout
layout=c(2, 1),
# Specify x-axis label
xlab="Measured value",
# Replace default panel function
panel = panel.violin)
# Panel function layering three displays, in this order (later calls
# draw on top of earlier ones): a smoothed-density scatter, a loess
# trend curve, and the y = x identity line.
panel.ss <- function(x, y, ...) {
# Smoothed color-density representation of the (x, y) scatter
panel.smoothScatter(x, y, ...)
# Overlay a loess regression fit, drawn in red
panel.loess(x, y, col = "red")
# Overlay the identity line (intercept 0, slope 1) for reference
panel.abline(0, 1)
}
# Create plot
xyplot(rate.female ~ rate.male, data = USCancerRates,
panel = panel.ss,
main = "County-wise deaths due to cancer")
## (loaded the KernSmooth namespace)
# Prepanel function: compute panel limits so that both the histogram
# bars and an overlaid kernel density curve fit inside each panel.
prepanel.histdens.2 <- function(x, ...) {
# Default histogram limits; h$ylim[2] is the height of the tallest bar
h <- prepanel.default.histogram(x, ...)
# Kernel density estimate of the same data, NAs removed
d <- density(x, na.rm = TRUE)
# Trim xlim to the 0.5%-99.5% quantiles, dropping extreme tails
list(xlim = quantile(x, c(0.005, 0.995), na.rm = TRUE),
# Upper y-limit: the taller of the density peak and the tallest bar
ylim = c(0, max(d$y, h$ylim[2])))
}
# Panel function: draw the histogram, then overlay the kernel density
# curve estimated from the same data (NAs removed). Pairs with
# prepanel.histdens.2, which reserves vertical room for the curve.
panel.histdens <- function(x, ...) {
panel.histogram(x, ...)
panel.lines(density(x, na.rm = TRUE))
}
# Create a histogram of rate.male and rate.female
histogram(~ rate.male + rate.female,
data = USCancerRates, outer = TRUE,
type = "density", nint = 50,
border = "transparent", col = "lightblue",
# The panel function: panel.histdens
panel = panel.histdens,
# The prepanel function: prepanel.histdens.2
prepanel = prepanel.histdens.2,
# Ensure that the x-axis is log-transformed
# and has relation "sliced"
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
relation = "sliced")),
xlab = "Rate (per 100,000)")
# Create the box and whisker plot
bwplot(division.ordered ~ rate.male,
data = USCancerRates,
# Indicate median by line instead of dot
pch = "|",
# Include notches for confidence interval
notch = TRUE,
# The x-axis should plot log-transformed values
scales = list(x = list(log=TRUE, equispaced.log=FALSE)),
xlab = "Death Rate in Males (per 100,000)")
# Load the 'latticeExtra' package
library(latticeExtra)
# Create summary dataset
USCancerRates.state <-
with(USCancerRates, {
rmale <- tapply(rate.male, state, median, na.rm = TRUE)
rfemale <- tapply(rate.female, state, median, na.rm = TRUE)
data.frame(Rate = c(rmale, rfemale),
State = rep(names(rmale), 2),
Gender = rep(c("Male", "Female"),
each = length(rmale)))
})
# Reorder levels
library(dplyr)
USCancerRates.state <-
mutate(USCancerRates.state, State = reorder(State, Rate))
head(USCancerRates.state)
## Rate State Gender
## 1 286.00 Alabama Male
## 2 237.95 Alaska Male
## 3 209.30 Arizona Male
## 4 284.10 Arkansas Male
## 5 221.30 California Male
## 6 204.40 Colorado Male
# URLs for emojis
emoji.man <- "https://twemoji.maxcdn.com/72x72/1f468.png"
emoji.woman <- "https://twemoji.maxcdn.com/72x72/1f469.png"
# Create dotplot
# dotplot(State ~ Rate, data = USCancerRates.state,
# Specify grouping variable
# groups = Gender,
# Specify panel function
# panel = panel.xyimage,
# Specify emoji URLs
# pch = c(emoji.woman, emoji.man),
# Make symbols smaller
# cex = 0.75)
Chapter 5 - Extensions and the lattice ecosystem
New methods - lattice is used by many packages because it is highly extensible:
New high-level functions can be created:
Manipulation (extension) of trellis objects:
Example code includes:
# Use 'EuStockMarkets' time series data
data(EuStockMarkets)
str(EuStockMarkets)
## Time-Series [1:1860, 1:4] from 1991 to 1999: 1629 1614 1607 1621 1618 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "DAX" "SMI" "CAC" "FTSE"
# Create time series plot
xyplot(EuStockMarkets,
# Plot all series together
superpose = TRUE,
# Split up the time axis into parts
cut = list(number = 3, overlap = 0.25))
# Create time series plot
xyplot(EuStockMarkets,
# Specify panel function
panel=panel.horizonplot,
# Specify prepanel function
prepanel=prepanel.horizonplot)
# Load required packages
library(maps)
# Create map object for US counties
county.map <- map("county", plot = FALSE, fill = TRUE,
# Specify projection
projection = "sinusoidal")
# Create choropleth map
row.names(USCancerRates) <- rn_USCR
mapplot(row.names(USCancerRates) ~ log10(rate.male) + log10(rate.female),
data = USCancerRates,
xlab = "", scales = list(draw = FALSE),
# Specify map
map = county.map)
## Warning in (function (x, y, map, breaks, colramp, exact = FALSE,
## lwd = 0.5, : 64 unmatched regions: alaska,nome, alaska,wade hampton,
## alaska,haines, alaska,....
## Warning in (function (x, y, map, breaks, colramp, exact = FALSE,
## lwd = 0.5, : 64 unmatched regions: alaska,nome, alaska,wade hampton,
## alaska,haines, alaska,....
# Create subset for Louisiana
LACancerRates1 <- filter(USCancerRates, state == "Louisiana")
str(LACancerRates1)
## 'data.frame': 64 obs. of 11 variables:
## $ rate.male : num 369 361 349 338 338 ...
## $ LCL95.male : num 316 289 302 308 303 ...
## $ UCL95.male : num 428 446 402 372 376 ...
## $ rate.female : num 162 193 215 194 192 ...
## $ LCL95.female : num 134 150 184 176 170 ...
## $ UCL95.female : num 196 246 250 215 218 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ county :Class 'AsIs' chr [1:64] "Richland Parish" "Madison Parish" "De Soto Parish" "St. Bernard Parish" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 46 46 46 46 46 46 46 46 46 46 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 7 7 7 7 7 7 7 7 7 7 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Reorder levels of county
LACancerRates2 <-
mutate(LACancerRates1,
county = reorder(county, rate.male))
# Draw confidence intervals
segplot(county ~ LCL95.male + UCL95.male,
data = LACancerRates2,
# Add point estimates
centers = rate.male,
# Draw segments rather than bands
draw.bands = FALSE)
# The 'USCancerRates' dataset
str(USCancerRates)
## 'data.frame': 3041 obs. of 11 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 9 9 9 9 9 9 9 9 9 9 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Load the 'hexbin' package
library(hexbin)
# Create hexbin plot
hexbinplot(rate.female ~ rate.male,
data = USCancerRates,
# Add a regression line
type = "r",
# function to transform counts
trans = sqrt,
# function to invert transformed counts
inv = function(x) x^2
)
# Load the 'directlabels' package
library(directlabels)
# Use the 'airquality' dataset
str(airquality)
## 'data.frame': 153 obs. of 7 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R : int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Month.Name: Factor w/ 12 levels "January","February",..: 5 5 5 5 5 5 5 5 5 5 ...
# Create factor variable
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
# Create density plot object
tplot2 <-
densityplot(~ Ozone + Temp, data = airquality,
# Variables should go in different panels
outer = TRUE,
# Specify grouping variable
groups = Month.Name,
# Suppress display of data points
plot.points = FALSE,
# Add reference line
ref = TRUE,
# Specify layout
layout = c(2, 1),
# Omit strip labels
strip = FALSE,
# Provide column-specific x-axis labels
xlab = c("Ozone (ppb)", "Temperature (F)"),
# Let panels have independent scales
scales = list(relation="free"))
# Produce plot with direct labels
direct.label(tplot2)
# 'USCancerRates' is pre-loaded
str(USCancerRates)
## 'data.frame': 3041 obs. of 11 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 9 9 9 9 9 9 9 9 9 9 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Create scatter plot
p <- xyplot(rate.female ~ rate.male, data = USCancerRates,
# Change plotting character
pch = 16,
# Make points semi-transparent
alpha = 0.25)
# Create layer with reference grid
l0 <- layer_(panel.grid())
# Create layer with reference line
l1 <- layer(panel.abline(0, 1))
# Create layer with regression fit
l2 <- layer(panel.smoother(x, y, method="lm"))
# Combine and plot
p + l0 + l1 + l2
Chapter 1 - R Time Series Visualization Tools
Refresher on xts and the plot() function:
Other useful visualizing functions:
Example code includes:
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
# data is a 504x4 xts object of Yahoo, Microsoft, Citigroup, and Dow Chemical
tmpData <- readr::read_delim("./RInputFiles/dataset_1_1.csv", delim=" ")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## yahoo = col_double(),
## microsoft = col_double(),
## citigroup = col_double(),
## dow_chemical = col_double()
## )
data <- xts::xts(tmpData[, -1], order.by=as.POSIXct(tmpData$Index))
# Display the first few lines of the data
head(data)
## yahoo microsoft citigroup dow_chemical
## 2015-01-01 18:00:00 50.17 44.30501 53.45259 42.48209
## 2015-01-04 18:00:00 49.13 43.89759 51.76803 41.16821
## 2015-01-05 18:00:00 49.21 43.25329 49.94556 40.50662
## 2015-01-06 18:00:00 48.59 43.80284 50.40857 40.44139
## 2015-01-07 18:00:00 50.23 45.09144 51.16711 41.44776
## 2015-01-08 18:00:00 49.72 44.71244 50.02437 41.38253
# Display the column names of the data
colnames(data)
## [1] "yahoo" "microsoft" "citigroup" "dow_chemical"
# Plot yahoo data and add title
plot(data[, "yahoo"], main="yahoo")
# Replot yahoo data with labels for X and Y axes
plot(data[, "yahoo"], main="yahoo", xlab="date", ylab="price")
# Note that type="h" is for bars
# Plot the second time series and change title
plot(data[, 2], main="microsoft")
# Replot with same title, add subtitle, use bars
plot(data[, 2], main="microsoft", sub="Daily closing price since 2015", type="h")
# Change line color to red
lines(data[, 2], col="red")
# Plot two charts on same graphical window
par(mfrow = c(2, 1))
plot(data[, 1], main="yahoo")
plot(data[, 2], main="microsoft")
# Replot with reduced margin and character sizes
par(mfrow = c(2, 1), mex=0.6, cex=0.8)
plot(data[, 1], main="yahoo")
plot(data[, 2], main="microsoft")
par(mfrow = c(1, 1), mex=1, cex=1)
# Plot the "microsoft" series
plot(data[, "microsoft"], main="Stock prices since 2015")
# Add the "dow_chemical" series in red
lines(data[, "dow_chemical"], col="red")
# Add a Y axis on the right side of the chart
axis(side=4, at=pretty(data[, "dow_chemical"]))
# Add a legend in the bottom right corner
legend("bottomright", legend=c("microsoft", "dow_chemical"), col=c("black", "red"), lty=c(1, 1))
# Plot the "citigroup" time series
plot(data[, "citigroup"], main="Citigroup")
# Create vert_line to identify January 4th, 2016 in citigroup
vert_line <- which(index(data[, "citigroup"]) == as.POSIXct("2016-01-04"))
# Add a red vertical line using vert_line
abline(v = .index(data[, "citigroup"])[vert_line], col = "red")
# Create hori_line to identify average price of citigroup
hori_line <- mean(data[, "citigroup"])
# Add a blue horizontal line using hori_line
abline(h = hori_line, col = "blue")
# Create period to hold the 3 months of 2015
period <- c("2015-01/2015-03")
# Highlight the first three months of 2015
PerformanceAnalytics::chart.TimeSeries(data[, "citigroup"], period.areas=period)
# Highlight the first three months of 2015 in light grey
PerformanceAnalytics::chart.TimeSeries(data[, "citigroup"], period.areas=period, period.color="lightgrey")
# Plot the microsoft series
plot(data[, "microsoft"], main="Dividend date and amount")
# Add the citigroup series
lines(data[, "citigroup"], col="orange", lwd=2)
# Add a new y axis for the citigroup series
axis(side=4, at=pretty(data[, "citigroup"]), col="orange")
micro_div_date <- "15 Nov. 2016"
citi_div_date <- "13 Nov. 2016"
micro_div_value <- "$0.39"
citi_div_value <- "$0.16"
# Same plot as the previous exercise
plot(data$microsoft, main = "Dividend date and amount")
lines(data$citigroup, col = "orange", lwd = 2)
axis(side = 4, at = pretty(data$citigroup), col = "orange")
# Create the two legend strings
micro <- paste0("Microsoft div. of ", micro_div_value," on ", micro_div_date)
citi <- paste0("Citigroup div. of ", citi_div_value," on ", citi_div_date)
# Create the legend in the bottom right corner
legend(x = "bottomright", legend = c(micro, citi), col = c("black", "orange"), lty = c(1, 1))
data_1_1_old <- data
Chapter 2 - Univariate Time Series
Univariate time series analysis - deals with only a single variable:
Other visualization tools:
Combining everything so far:
Example code includes:
tmpData <- readr::read_delim("./RInputFiles/dataset_2_1.csv", delim=" ")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## Apple = col_double()
## )
names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot Apple's stock price
plot(data[, "apple"], main="Apple stock price")
# Create a time series called rtn
rtn <- TTR::ROC(data[, "apple"])
# Plot Apple daily price and daily returns
par(mfrow=c(1, 2))
plot(data[, "apple"], main="Apple stock price")
plot(rtn)
par(mfrow=c(1, 1))
dim(rtn)
## [1] 522 1
rtn <- rtn[complete.cases(rtn), ]
dim(rtn)
## [1] 521 1
# Create a histogram of Apple stock returns
hist(rtn, main="Apple stock return distribution", probability=TRUE)
# Add a density line
lines(density(rtn[complete.cases(rtn), ]))
# Redraw a thicker, red density line
lines(density(rtn[complete.cases(rtn), ]), col="red", lwd=2)
rtnRaw <- as.double(rtn$apple)
# Draw box and whisker plot for the Apple returns
boxplot(rtnRaw)
# Draw a box and whisker plot of a normal distribution
boxplot(rnorm(1000))
# Redraw both plots on the same graphical window
par(mfrow=c(2, 1))
boxplot(rtnRaw, horizontal=TRUE)
boxplot(rnorm(1000), horizontal=TRUE)
par(mfrow=c(1, 1))
# Draw autocorrelation plot
acf(rtn, main="Apple return autocorrelation")
# Redraw with a maximum lag of 10
acf(rtn, main="Apple return autocorrelation", lag.max=10)
# Create q-q plot
qqnorm(rtn, main="Apple return QQ-plot")
# Add a red line showing normality
qqline(rtn, col="red")
par(mfrow=c(2, 2))
hist(rtn, probability=TRUE)
lines(density(rtn), col="red")
boxplot(rtnRaw)
acf(rtn)
qqnorm(rtn)
qqline(rtn, col="red")
par(mfrow=c(1, 1))
Chapter 3 - Multivariate Time Series
Dealing with higher dimensions - visualization challenges with larger numbers of series:
Multivariate time series:
Higher dimension time series:
Example code includes:
# You are provided with a dataset (portfolio) containing the weights of stocks A (stocka) and B (stockb) in your portfolio for each month in 2016
stockA <- c(0.1, 0.4, 0.5, 0.5, 0.2, 0.3, 0.7, 0.8, 0.7, 0.2, 0.1, 0.2)
stockB <- c(0.9, 0.6, 0.5, 0.5, 0.8, 0.7, 0.3, 0.2, 0.3, 0.8, 0.9, 0.8)
pDates <- as.Date(c('2016-01-01', '2016-02-01', '2016-03-01', '2016-04-01', '2016-05-01', '2016-06-01', '2016-07-01', '2016-08-01', '2016-09-01', '2016-10-01', '2016-11-01', '2016-12-01'))
portfolio <- xts(data.frame(stocka=stockA, stockb=stockB), order.by=pDates)
# Plot stacked barplot
barplot(portfolio)
# Plot grouped barplot
barplot(portfolio, beside=TRUE)
tmpData <- readr::read_delim("./RInputFiles/data_3_2.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## sp500 = col_double(),
## citigroup = col_double(),
## microsoft = col_double(),
## apple = col_double(),
## dowchemical = col_double(),
## yahoo = col_double()
## )
# names(tmpData) <- c("Index", "apple")
my_data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
citi <- as.numeric(my_data$citigroup)
sp500 <- as.numeric(my_data$sp500)
# Draw the scatterplot
plot(y=citi, x=sp500)
# Draw a regression line
abline(reg=lm(citi ~ sp500), col="red", lwd=2)
# my_data contains the returns for the S&P 500 index and 5 stocks: Citigroup, Microsoft, Apple, Dow Chemical and Yahoo
# Create correlation matrix using Pearson method
cor(my_data)
## sp500 citigroup microsoft apple dowchemical yahoo
## sp500 1.0000000 0.5097953 0.3743215 0.3576966 0.5217243 0.2900962
## citigroup 0.5097953 1.0000000 0.4841408 0.4291841 0.5085190 0.4029490
## microsoft 0.3743215 0.4841408 1.0000000 0.5133469 0.3954523 0.4329388
## apple 0.3576966 0.4291841 0.5133469 1.0000000 0.3627755 0.3413626
## dowchemical 0.5217243 0.5085190 0.3954523 0.3627755 1.0000000 0.2938749
## yahoo 0.2900962 0.4029490 0.4329388 0.3413626 0.2938749 1.0000000
# Create correlation matrix using Spearman method
cor(my_data, method="spearman")
## sp500 citigroup microsoft apple dowchemical yahoo
## sp500 1.0000000 0.5192579 0.4244237 0.3518853 0.5316235 0.3262037
## citigroup 0.5192579 1.0000000 0.4976477 0.4374850 0.5607511 0.3780730
## microsoft 0.4244237 0.4976477 1.0000000 0.5128477 0.4684114 0.4448179
## apple 0.3518853 0.4374850 0.5128477 1.0000000 0.3681791 0.3680715
## dowchemical 0.5316235 0.5607511 0.4684114 0.3681791 1.0000000 0.3464743
## yahoo 0.3262037 0.3780730 0.4448179 0.3680715 0.3464743 1.0000000
# Create scatterplot matrix
pairs(as.data.frame(my_data))
# Create upper panel scatterplot matrix
pairs(as.data.frame(my_data), lower.panel=NULL)
cor_mat <- cor(my_data)
# In this exercise, you will use the provided correlation matrix cor_mat
# Create correlation matrix
corrplot::corrplot(cor_mat)
# Create correlation matrix with numbers
corrplot::corrplot(cor_mat, method="number")
# Create correlation matrix with colors
corrplot::corrplot(cor_mat, method="color")
# Create upper triangle correlation matrix
corrplot::corrplot(cor_mat, method="number", type="upper")
# Draw heatmap of cor_mat
corrplot::corrplot(cor_mat, method="color")
# Draw upper heatmap
corrplot::corrplot(cor_mat, method="color", type="upper")
# Draw the upper heatmap with hclust
corrplot::corrplot(cor_mat, method="color", type="upper", order="hclust")
Chapter 4 - Case Study: Stock Picking for Portfolios
Case study presentation:
New stocks:
Course conclusion:
Example code includes:
# --- Case study: portfolio value/return plots and new-stock comparison ---
# In this exercise, you are provided with a dataset data containing the value and the return of the portfolio over time, in value and return, respectively.
tmpData <- readr::read_delim("./RInputFiles/data_4_1.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## value = col_double(),
## return = col_double()
## )
# names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot the portfolio value
plot(data$value, main="Portfolio Value")
# Plot the portfolio return
plot(data$return, main="Portfolio Return")
# Plot a histogram of portfolio return
hist(data$return, probability=TRUE)
# Add a density line
lines(density(data$return), col="red", lwd=2)
# Keep a copy before `data` is overwritten by the next read below
tmpPortfolioData <- data
# The new dataset data containing four new stocks is available in your workspace: Goldman Sachs (GS), Coca-Cola (KO), Walt Disney (DIS), Caterpillar (CAT)
tmpData <- readr::read_delim("./RInputFiles/data_4_3.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## GS = col_double(),
## KO = col_double(),
## DIS = col_double(),
## CAT = col_double()
## )
# names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot the four stocks on the same graphical window
par(mfrow=c(2, 2), mex=0.8, cex=0.8)
plot(data[, 1])
plot(data[, 2])
plot(data[, 3])
plot(data[, 4])
par(mfrow=c(1, 1), mex=1, cex=1)
# In this exercise, you are provided with four individual series containing the return of the same four stocks:
# gs, ko, dis, cat
# Solution makes absolutely no sense
# NOTE(review): `cat` here shadows base::cat() for the rest of the session.
portfolio <- as.numeric(tmpPortfolioData$return)
gs <- as.numeric(TTR::ROC(data[, "GS"]))[-1]
ko <- as.numeric(TTR::ROC(data[, "KO"]))[-1]
dis <- as.numeric(TTR::ROC(data[, "DIS"]))[-1]
cat <- as.numeric(TTR::ROC(data[, "CAT"]))[-1]
# Draw the scatterplot of gs against the portfolio
plot(y=portfolio, x=gs)
# Add a regression line in red
# NOTE(review): the formula regresses gs on portfolio while the plot puts
# portfolio on the y-axis — axes and formula appear swapped, consistent with
# the "Solution makes absolutely no sense" remark above; same for ko/dis/cat.
abline(reg=lm(gs ~ portfolio), col="red", lwd=2)
# Plot scatterplots and regression lines to a 2x2 window
par(mfrow=c(2, 2))
plot(y=portfolio, x=gs)
abline(reg=lm(gs ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=ko)
abline(reg=lm(ko ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=dis)
abline(reg=lm(dis ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=cat)
abline(reg=lm(cat ~ portfolio), col="red", lwd=2)
par(mfrow=c(1, 1))
# In this exercise, you are given a dataset old.vs.new.portfolio with the following self-explanatory columns: old.portfolio.value, new.portfolio.value, old.portfolio.rtn, new.portfolio.rtn
tmpData <- readr::read_delim("./RInputFiles/old.vs.new.portfolio.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## old.portfolio.value = col_double(),
## new.portfolio.value = col_double(),
## old.portfolio.rtn = col_double(),
## new.portfolio.rtn = col_double()
## )
# names(tmpData) <- c("Index", "apple")
old.vs.new.portfolio <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot new and old portfolio values on same chart
plot(old.vs.new.portfolio$old.portfolio.value)
lines(old.vs.new.portfolio$new.portfolio.value, col = "red")
# Plot density of the new and old portfolio returns on same chart
plot(density(old.vs.new.portfolio$old.portfolio.rtn))
lines(density(old.vs.new.portfolio$new.portfolio.rtn), col ="red")
# Draw value, return, drawdowns of old portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, "old.portfolio.rtn"])
# Draw value, return, drawdowns of new portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, "new.portfolio.rtn"])
# Draw both portfolios on same chart
# Draw value, return, drawdowns of new portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, c("old.portfolio.rtn", "new.portfolio.rtn")])
Chapter 1 - Custom ggplot2 themes
Introduction to the data - finding stories in datasets:
Filtering and plotting the data:
Custom ggplot2 themes - providing a custom look to a chart:
Example code includes:
# --- ILO data: join, filter to Europe, scatter plot, custom theme ---
library(ggplot2)
load("./RInputFiles/ilo_hourly_compensation.RData")
load("./RInputFiles/ilo_working_hours.RData")
# Join both data frames
ilo_data <- ilo_hourly_compensation %>%
inner_join(ilo_working_hours, by = c("country", "year"))
# Count the resulting rows
ilo_data %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 612
# Examine ilo_data
ilo_data
## # A tibble: 612 x 4
## country year hourly_compensation working_hours
## <chr> <chr> <dbl> <dbl>
## 1 Australia 1980.0 8.44 34.6
## 2 Canada 1980.0 8.87 34.8
## 3 Denmark 1980.0 10.8 31.9
## 4 Finland 1980.0 8.61 35.6
## 5 France 1980.0 8.90 35.4
## 6 Italy 1980.0 8.09 35.7
## 7 Japan 1980.0 5.46 40.8
## 8 Korea, Rep. 1980.0 0.950 55.3
## 9 Norway 1980.0 11.8 30.4
## 10 Spain 1980.0 5.86 36.8
## # ... with 602 more rows
# Turn year into a factor
ilo_data <- ilo_data %>%
mutate(year = as.factor(as.numeric(year)))
# Turn country into a factor
ilo_data <- ilo_data %>%
mutate(country = as.factor(country))
# Examine the European countries vector
european_countries <- c('Finland', 'France', 'Italy', 'Norway', 'Spain', 'Sweden', 'Switzerland', 'United Kingdom', 'Belgium', 'Ireland', 'Luxembourg', 'Portugal', 'Netherlands', 'Germany', 'Hungary', 'Austria', 'Czech Rep.')
european_countries
## [1] "Finland" "France" "Italy" "Norway"
## [5] "Spain" "Sweden" "Switzerland" "United Kingdom"
## [9] "Belgium" "Ireland" "Luxembourg" "Portugal"
## [13] "Netherlands" "Germany" "Hungary" "Austria"
## [17] "Czech Rep."
# Only retain European countries
ilo_data <- ilo_data %>%
filter(country %in% european_countries)
# Examine the structure of ilo_data
str(ilo_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 380 obs. of 4 variables:
## $ country : Factor w/ 30 levels "Australia","Austria",..: 8 9 15 22 25 27 28 29 8 9 ...
## $ year : Factor w/ 27 levels "1980","1981",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ hourly_compensation: num 8.61 8.9 8.09 11.8 5.86 ...
## $ working_hours : num 35.6 35.4 35.7 30.4 36.8 ...
# Group and summarize the data
ilo_data %>%
group_by(year) %>%
summarize(mean_hourly_compensation = mean(hourly_compensation),
mean_working_hours = mean(working_hours))
## # A tibble: 27 x 3
## year mean_hourly_compensation mean_working_hours
## <fct> <dbl> <dbl>
## 1 1980 9.27 34.0
## 2 1981 8.69 33.6
## 3 1982 8.36 33.5
## 4 1983 7.81 33.9
## 5 1984 7.54 33.7
## 6 1985 7.79 33.7
## 7 1986 9.70 34.0
## 8 1987 12.1 33.6
## 9 1988 13.2 33.7
## 10 1989 13.1 33.5
## # ... with 17 more rows
# Filter for 2006
plot_data <- ilo_data %>%
filter(year == 2006)
# Create the scatter plot
ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation))
# Create the plot
ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation)) +
# Add labels
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
)
# Save your current plot into a variable: ilo_plot
ilo_plot <- ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation)) +
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
)
# Try out theme_minimal
ilo_plot +
theme_minimal()
# Try out any other possible theme function
ilo_plot +
theme_linedraw()
# NOTE(review): windowsFonts() exists only on Windows; this line fails on
# macOS/Linux (use e.g. the extrafont/systemfonts packages there instead).
windowsFonts(Bookman=windowsFont("Bookman Old Style"))
ilo_plot <- ilo_plot +
theme_minimal() +
# Customize the "minimal" theme with another custom "theme" call
theme(
text = element_text(family = "Bookman"),
title = element_text(color = "gray25"),
plot.subtitle = element_text(size=12),
plot.caption = element_text(color = "gray30")
)
# Render the plot object
ilo_plot
ilo_plot +
# "theme" calls can be stacked upon each other, so this is already the third call of "theme"
theme(
plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm")
)
Chapter 2 - Creating Custom and Unique Visualizations
Visualizing aspects of data with facets:
Custom plot to emphasize change:
Polishing the dot plot:
Finalizing plots for different audiences and devices:
Example code includes:
# --- Faceted comparison of 1996 vs 2006 working hours/compensation ---
# Filter ilo_data to retain the years 1996 and 2006
ilo_data <- ilo_data %>%
filter(year == 1996 | year == 2006)
# Again, you save the plot object into a variable so you can save typing later on
ilo_plot <- ggplot(ilo_data, aes(x = working_hours, y = hourly_compensation)) +
geom_point() +
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
) +
# Add facets here
facet_grid(facets = . ~ year)
ilo_plot
# For a starter, let's look at what you did before: adding various theme calls to your plot object
ilo_plot +
theme_minimal() +
theme(
text = element_text(family = "Bookman", color = "gray25"),
plot.subtitle = element_text(size = 12),
plot.caption = element_text(color = "gray30"),
plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm")
)
theme_ilo <- function() {
theme_minimal() +
theme(
text = element_text(family = "Bookman", color = "gray25"),
plot.subtitle = element_text(size = 12),
plot.caption = element_text(color = "gray30"),
plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm"))
}
# --- Dumbbell/dot plot of working hours by country, 1996 vs 2006 ---
# Apply your theme function
ilo_plot <- ilo_plot + theme_ilo()
# Examine ilo_plot
ilo_plot
ilo_plot +
# Add another theme call
theme(
# Change the background fill to make it a bit darker
strip.background = element_rect(fill = "gray60", color = "gray95"),
# Make text a bit bigger and change its color to white
strip.text = element_text(size = 11, color = "white")
)
# Create the dot plot
ggplot(ilo_data) +
geom_path(aes(x=working_hours, y=country))
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
# Add an arrow to each path
arrow = arrow(length = unit(1.5, "mm"), type = "closed"))
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
# Add a geom_text() geometry
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1))
)
library(forcats)
# Reorder country factor levels
ilo_data <- ilo_data %>%
# Arrange data frame
arrange(country, year) %>%
# Reorder countries by working hours in 2006
mutate(country = fct_reorder(country,
working_hours,
last))
# Plot again
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1))
)
# Save plot into an object for reuse
ilo_dot_plot <- ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
# Specify the hjust aesthetic with a conditional value
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1),
hjust = ifelse(year == "2006", 1.4, -0.4)
),
# Change the appearance of the text
size = 3,
family = "Bookman",
color = "gray25"
)
ilo_dot_plot
# Reuse ilo_dot_plot
ilo_dot_plot <- ilo_dot_plot +
# Add labels to the plot
labs(
x = "Working hours per week",
y = "Country",
title = "People work less in 2006 compared to 1996",
subtitle = "Working hours in European countries, development since 1996",
caption = "Data source: ILO, 2017"
) +
# Apply your theme
theme_ilo() +
# Change the viewport
coord_cartesian(xlim = c(25, 41))
# View the plot
ilo_dot_plot
# Compute temporary data set for optimal label placement
median_working_hours <- ilo_data %>%
group_by(country) %>%
summarize(median_working_hours_per_country = median(working_hours)) %>%
ungroup()
# Have a look at the structure of this data set
str(median_working_hours)
## Classes 'tbl_df', 'tbl' and 'data.frame': 17 obs. of 2 variables:
## $ country : Factor w/ 30 levels "Netherlands",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ median_working_hours_per_country: num 27 27.8 28.4 31 30.9 ...
ilo_dot_plot +
# Add label for country
geom_text(data = median_working_hours,
aes(y = country,
x = median_working_hours_per_country,
label = country),
vjust = -0.5,
size=3,
family = "Bookman",
color = "gray25") +
# Remove axes and grids
theme(
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
panel.grid = element_blank(),
# Also, let's reduce the font size of the subtitle
plot.subtitle = element_text(size = 9)
)
Chapter 3 - Introduction to R Markdown
What is R Markdown?
Formatting with R Markdown:
R Code in R Markdown Documents:
Images in R Markdown Files:
Example code is contained in the summary Excel worksheet.
Chapter 4 - Customizing R Markdown Reports
Advanced YAML Settings (YAML is a recursive acronym meaning "YAML Ain't Markup Language"):
Custom stylesheets - creating a unique theme for a report:
Beautiful tables:
Summary:
Example code is contained in the summary Excel worksheet.
Chapter 1 - Binomial Distribution
Flipping coins in R - for example, rbinom(1, 1, 0.5) - 1 draw of 1 coin with a 50% chance of being heads:
Density and cumulative density:
Expected value and variance:
Example code includes:
# --- Binomial distribution: rbinom/dbinom/pbinom and moment checks ---
# Generate 10 separate random flips with probability .3
rbinom(10, 1, 0.3)
## [1] 0 1 0 0 1 1 0 0 1 0
# Generate 100 occurrences of flipping 10 coins, each with 30% probability
rbinom(100, 10, 0.3)
## [1] 2 1 6 4 2 3 3 6 8 5 1 1 3 7 1 5 4 6 4 3 4 2 4 2 4 1 2 5 1 7 2 5 2 5 3
## [36] 4 5 2 3 3 0 4 3 3 5 2 4 1 2 3 2 1 4 5 4 0 5 6 5 2 1 2 3 2 2 4 2 5 3 5
## [71] 3 4 1 2 4 1 3 2 6 3 4 2 4 6 6 2 2 2 4 6 4 4 2 1 4 3 0 4 3 3
# Calculate the probability that 2 are heads using dbinom
dbinom(2, 10, 0.3)
## [1] 0.2334744
# Confirm your answer with a simulation using rbinom
mean(rbinom(10000, 10, 0.3) == 2)
## [1] 0.2353
# Calculate the probability that at least five coins are heads
1 - pbinom(4, 10, 0.3)
## [1] 0.1502683
# Confirm your answer with a simulation of 10,000 trials
mean(rbinom(10000, 10, 0.3) >= 5)
## [1] 0.1533
# Here is how you computed the answer in the last problem
mean(rbinom(10000, 10, .3) >= 5)
## [1] 0.149
# Try now with 100, 1000, 10,000, and 100,000 trials
# (estimates stabilize toward the exact 0.1502683 as trials grow)
mean(rbinom(100, 10, .3) >= 5)
## [1] 0.16
mean(rbinom(1000, 10, .3) >= 5)
## [1] 0.158
mean(rbinom(10000, 10, .3) >= 5)
## [1] 0.1518
mean(rbinom(100000, 10, .3) >= 5)
## [1] 0.15187
# Calculate the expected value using the exact formula (E[X] = n * p)
25 * 0.3
## [1] 7.5
# Confirm with a simulation using rbinom
mean(rbinom(10000, 25, 0.3))
## [1] 7.4447
# Calculate the variance using the exact formula (Var[X] = n * p * (1 - p))
25 * 0.3 * (1 - 0.3)
## [1] 5.25
# Confirm with a simulation using rbinom
var(rbinom(10000, 25, 0.3))
## [1] 5.15845
Chapter 2 - Laws of Probability
Probability of Event A and Event B:
Probability of A or B:
Multiplying random variables:
Adding random variables:
Example code includes:
# --- Laws of probability: AND/OR of independent events, scaling/adding RVs ---
# Simulate 100,000 flips of a coin with a 40% chance of heads
A <- rbinom(100000, 1, 0.4)
# Simulate 100,000 flips of a coin with a 20% chance of heads
B <- rbinom(100000, 1, 0.2)
# Estimate the probability both A and B are heads (independence: ~ .4 * .2)
mean(A & B)
## [1] 0.0805
# You've already simulated 100,000 flips of coins A and B
A <- rbinom(100000, 1, .4)
B <- rbinom(100000, 1, .2)
# Simulate 100,000 flips of coin C (70% chance of heads)
C <- rbinom(100000, 1, .7)
# Estimate the probability A, B, and C are all heads
mean(A & B & C)
## [1] 0.05589
# Simulate 100,000 flips of a coin with a 60% chance of heads
A <- rbinom(100000, 1, 0.6)
# Simulate 100,000 flips of a coin with a 10% chance of heads
B <- rbinom(100000, 1, 0.1)
# Estimate the probability either A or B is heads
mean(A | B)
## [1] 0.63736
# Use rbinom to simulate 100,000 draws from each of X and Y
X <- rbinom(100000, 10, 0.6)
Y <- rbinom(100000, 10, 0.7)
# Estimate the probability either X or Y is <= to 4
mean((X <= 4) | (Y <= 4))
## [1] 0.20613
# Use pbinom to calculate the probabilities separately
prob_X_less <- pbinom(4, 10, 0.6)
prob_Y_less <- pbinom(4, 10, 0.7)
# Combine these to calculate the exact probability either <= 4
# (inclusion-exclusion: P(A) + P(B) - P(A)P(B) for independent events)
prob_X_less + prob_Y_less - prob_X_less * prob_Y_less
## [1] 0.2057164
# Simulate 100,000 draws of a binomial with size 20 and p = .1
X <- rbinom(100000, 20, 0.1)
# Estimate the expected value of X
mean(X)
## [1] 1.9991
# Estimate the expected value of 5 * X (E[kX] = k E[X])
mean(5 * X)
## [1] 9.9955
# Estimate the variance of X
var(X)
## [1] 1.786197
# Estimate the variance of 5 * X (Var[kX] = k^2 Var[X])
var(5 * X)
## [1] 44.65493
# Simulate 100,000 draws of X (size 20, p = .3) and Y (size 40, p = .1)
X <- rbinom(100000, 20, 0.3)
Y <- rbinom(100000, 40, 0.1)
# Estimate the expected value of X + Y
mean(X + Y)
## [1] 9.99048
# Find the variance of X + Y
var(X + Y)
## [1] 7.798627
# Find the variance of 3 * X + Y
var(3 * X + Y)
## [1] 41.20331
Chapter 3 - Bayesian Statistics
Updating with evidence:
Prior probability - may not be equal odds prior to seeing any evidence:
Bayes' theorem:
Example code includes:
# --- Bayesian updating: posterior P(fair) by simulation and by dbinom ---
# Simulate 50000 cases of flipping 20 coins from fair and from biased
fair <- rbinom(50000, 20, 0.5)
biased <- rbinom(50000, 20, 0.75)
# How many fair cases, and how many biased, led to exactly 11 heads?
fair_11 <- sum(fair == 11)
biased_11 <- sum(biased == 11)
# Find the fraction of fair coins that are 11 out of all coins that were 11
fair_11 / (fair_11 + biased_11)
## [1] 0.8487457
# How many fair cases, and how many biased, led to exactly 16 heads?
fair_16 <- sum(fair == 16)
biased_16 <- sum(biased == 16)
# Find the fraction of fair coins that are 16 out of all coins that were 16
fair_16 / (fair_16 + biased_16)
## [1] 0.02418033
# Simulate 8000 cases of flipping a fair coin, and 2000 of a biased coin
# (an 80/20 prior encoded in the simulation sizes)
fair_flips <- rbinom(8000, 20, 0.5)
biased_flips <- rbinom(2000, 20, 0.75)
# Find the number of cases from each coin that resulted in 14/20
fair_14 <- sum(fair_flips == 14)
biased_14 <- sum(biased_flips == 14)
# Use these to estimate the posterior probability
fair_14 / (fair_14 + biased_14)
## [1] 0.4651515
# Simulate 80,000 draws from fair coin, 10,000 from each of high and low coins
flips_fair <- rbinom(80000, 20, 0.5)
flips_high <- rbinom(10000, 20, 0.75)
flips_low <- rbinom(10000, 20, 0.25)
# Compute the number of coins that resulted in 14 heads from each of these piles
fair_14 <- sum(flips_fair == 14)
high_14 <- sum(flips_high == 14)
low_14 <- sum(flips_low == 14)
# Compute the posterior probability that the coin was fair
fair_14 / (fair_14 + high_14 + low_14)
## [1] 0.6370197
# Use dbinom to calculate the probability of 11/20 heads with fair or biased coin
probability_fair <- dbinom(11, 20, 0.5)
probability_biased <- dbinom(11, 20, 0.75)
# Calculate the posterior probability that the coin is fair
probability_fair / (probability_fair + probability_biased)
## [1] 0.8554755
# Find the probability that a coin resulting in 14/20 is fair
probability_fair <- dbinom(14, 20, .5)
probability_biased <- dbinom(14, 20, .75)
probability_fair / (probability_fair + probability_biased)
## [1] 0.179811
# Find the probability that a coin resulting in 18/20 is fair
probability_fair <- dbinom(18, 20, .5)
probability_biased <- dbinom(18, 20, .75)
probability_fair / (probability_fair + probability_biased)
## [1] 0.002699252
# Use dbinom to find the probability of 16/20 from a fair or biased coin
probability_16_fair <- dbinom(16, 20, 0.5)
probability_16_biased <- dbinom(16, 20, 0.75)
# Use Bayes' theorem to find the posterior probability that the coin is fair
# (prior: 99% fair, 1% biased)
(probability_16_fair * 0.99) / (probability_16_fair * 0.99 + probability_16_biased * 0.01)
## [1] 0.7068775
Chapter 4 - Related Distributions
Normal distribution - symmetrical bell curve, Gaussian:
Poisson distribution - approximates the binomial under the assumption of a large number of trials each with a low probability:
Geometric distribution - example of flipping a coin with probability p and assessing when the first success occurs:
Example code includes:
# compare_histograms(): stacked faceted histograms of two numeric samples.
# Each sample is tagged with a label, the two are combined row-wise, and a
# single ggplot with one histogram per facet (two rows) is returned, so the
# shapes of the two distributions can be compared on a shared x scale.
compare_histograms <- function(variable1, variable2) {
  combined <- rbind(
    data.frame(value = variable1, variable = "Variable 1"),
    data.frame(value = variable2, variable = "Variable 2")
  )
  ggplot(combined, aes(value)) +
    geom_histogram() +
    facet_wrap(~ variable, nrow = 2)
}
# --- Normal/Poisson approximations to the binomial; geometric waiting times ---
# Draw a random sample of 100,000 from the Binomial(1000, .2) distribution
binom_sample <- rbinom(100000, 1000, 0.2)
# Draw a random sample of 100,000 from the normal approximation
# (mean = n*p = 200, sd = sqrt(n*p*(1-p)) = sqrt(160))
normal_sample <- rnorm(100000, 200, sqrt(160))
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, normal_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Use binom_sample to estimate the probability of <= 190 heads
mean(binom_sample <= 190)
## [1] 0.2292
# Use normal_sample to estimate the probability of <= 190 heads
mean(normal_sample <= 190)
## [1] 0.2152
# Calculate the probability of <= 190 heads with pbinom
pbinom(190, 1000, 0.2)
## [1] 0.2273564
# Calculate the probability of <= 190 heads with pnorm
pnorm(190, 200, sqrt(160))
## [1] 0.2145977
# Draw a random sample of 100,000 from the Binomial(10, .2) distribution
# (small n: the normal approximation is visibly poor here)
binom_sample <- rbinom(100000, 10, 0.2)
# Draw a random sample of 100,000 from the normal approximation
normal_sample <- rnorm(100000, 2, sqrt(1.6))
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, normal_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Draw a random sample of 100,000 from the Binomial(1000, .002) distribution
binom_sample <- rbinom(100000, 1000, 0.002)
# Draw a random sample of 100,000 from the Poisson approximation (lambda = n*p = 2)
poisson_sample <- rpois(100000, 2)
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, poisson_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the percentage of simulated values that are 0
mean(poisson_sample == 0)
## [1] 0.13513
# Use dpois to find the exact probability that a draw is 0
dpois(0, 2)
## [1] 0.1353353
# Simulate 100,000 draws from Poisson(1)
X <- rpois(100000, 1)
# Simulate 100,000 draws from Poisson(2)
Y <- rpois(100000, 2)
# Add X and Y together to create Z
Z <- X + Y
# Use compare_histograms to compare Z to the Poisson(3)
# (sum of independent Poissons is Poisson with summed rates)
compare_histograms(Z, rpois(100000, 3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Simulate 100 instances of flipping a 20% coin
flips <- rbinom(100, 1, 0.2)
# Use which to find the first case of 1 ("heads")
which(flips == 1)[1]
## [1] 6
# Existing code for finding the first instance of heads
which(rbinom(100, 1, .2) == 1)[1]
## [1] 5
# Replicate this 100,000 times using replicate()
replications <- replicate(100000, which(rbinom(100, 1, .2) == 1)[1])
# Histogram the replications with qplot
qplot(replications)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Generate 100,000 draws from the corresponding geometric distribution
# (rgeom counts failures before the first success, hence the offset of 1
# relative to the which()-based "index of first heads" simulation)
geom_sample <- rgeom(100000, 0.2)
# Compare the two distributions with compare_histograms
compare_histograms(replications, geom_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the probability the machine breaks on 5th day or earlier
pgeom(4, 0.1)
## [1] 0.40951
# Find the probability the machine is still working on 20th day
1 - pgeom(19, 0.1)
## [1] 0.1215767
# Calculate the probability of machine working on day 1-30
still_working <- 1 - pgeom(0:29, 0.1)
# Plot the probability for days 1 to 30
qplot(1:30, still_working)
Chapter 1 - Bootstrapping for Parameter Estimates
Introduction - beginning with bootstrapping approach:
Percentile and standard error methods:
Re-centering bootstrap distributions for hypothesis testing:
Example code includes:
# --- Bootstrapping with infer: CIs (percentile & SE methods) and p-values ---
manhattan <- readr::read_csv("./RInputFiles/manhattan.csv")
## Parsed with column specification:
## cols(
## rent = col_integer()
## )
# Will need to either call library(infer) or add infer:: to this code
library(infer)
# Generate bootstrap distribution of medians
rent_ci_med <- manhattan %>%
# Specify the variable of interest
specify(response = rent) %>%
# Generate 15000 bootstrap samples
generate(reps = 15000, type = "bootstrap") %>%
# Calculate the median of each bootstrap sample
calculate(stat = "median")
# View the structure of rent_ci_med
str(rent_ci_med)
## Classes 'tbl_df', 'tbl' and 'data.frame': 15000 obs. of 2 variables:
## $ replicate: int 1 2 3 4 5 6 7 8 9 10 ...
## $ stat : num 2422 2350 2262 2325 2350 ...
## - attr(*, "response")= symbol rent
# Plot a histogram of rent_ci_med
ggplot(rent_ci_med, aes(x=stat)) +
geom_histogram(binwidth=50)
# Percentile method
rent_ci_med %>%
summarize(l = quantile(stat, 0.025),
u = quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 2162 2875
# Standard error method
# Calculate observed median
rent_med_obs <- manhattan %>%
# Calculate observed median rent
summarize(median(rent)) %>%
# Extract numerical value
pull()
# Determine critical value
t_star <- qt(0.975, df = nrow(manhattan) - 1)
# Construct interval
# NOTE(review): sd(rent_ci_med$stat) reaches outside the pipe; sd(stat)
# would be the idiomatic form — the result is identical here.
rent_ci_med %>%
summarize(boot_se = sd(rent_ci_med$stat)) %>%
summarize(l = rent_med_obs - t_star * boot_se,
u = rent_med_obs + t_star * boot_se)
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 1994 2706
data(ncbirths, package="openintro")
str(ncbirths)
## 'data.frame': 1000 obs. of 13 variables:
## $ fage : int NA NA 19 21 NA NA 18 17 NA 20 ...
## $ mage : int 13 14 15 15 15 15 15 15 16 16 ...
## $ mature : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
## $ weeks : int 39 42 37 41 39 38 37 35 38 37 ...
## $ premie : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
## $ visits : int 10 15 11 6 9 19 12 5 9 13 ...
## $ marital : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
## $ gained : int 38 20 38 34 27 22 76 15 NA 52 ...
## $ weight : num 7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
## $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
## $ habit : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
## $ whitemom : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...
# Remove NA visits
ncbirths_complete_visits <- ncbirths %>%
filter(!is.na(visits))
# Generate 15000 bootstrap means
visit_ci_mean <- ncbirths_complete_visits %>%
specify(response=visits) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="mean")
# Calculate the 90% CI via percentile method
visit_ci_mean %>%
summarize(l = quantile(stat, 0.05),
u = quantile(stat, 0.95))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 11.9 12.3
# Calculate 15000 bootstrap SDs
visit_ci_sd <- ncbirths_complete_visits %>%
specify(response=visits) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="sd")
# Calculate the 90% CI via percentile method
visit_ci_sd %>%
summarize(l = quantile(stat, 0.05),
u = quantile(stat, 0.95))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 3.74 4.16
# Generate 15000 bootstrap samples centered at null
rent_med_ht <- manhattan %>%
specify(response = rent) %>%
hypothesize(null = "point", med = 2500) %>%
generate(reps = 15000, type = "bootstrap") %>%
calculate(stat = "median")
# Calculate observed median
rent_med_obs <- manhattan %>%
summarize(median(rent)) %>%
pull()
# Calculate p-value (one-sided: fraction of null medians above the observed)
rent_med_ht %>%
filter(stat > rent_med_obs) %>%
summarize(n() / 15000)
## # A tibble: 1 x 1
## `n()/15000`
## <dbl>
## 1 0.948
# Generate 1500 bootstrap means centered at null
weight_mean_ht <- ncbirths %>%
specify(response = weight) %>%
hypothesize(null = "point", mu = 7) %>%
generate(reps=1500, type="bootstrap") %>%
calculate(stat="mean")
# Calculate observed mean
weight_mean_obs <- ncbirths %>%
summarize(mean(weight)) %>%
pull()
# Calculate p-value (two-sided, hence the factor of 2)
weight_mean_ht %>%
filter(stat > weight_mean_obs) %>%
summarize((n()/1500) * 2)
## # A tibble: 1 x 1
## `(n()/1500) * 2`
## <dbl>
## 1 0.0253
Chapter 2 - Introducing the t-distribution
The t-distribution - especially useful when the population standard deviation is unknown (as is typically the case):
Estimating a mean with a t-interval:
The t-interval for paired data:
Testing a mean with a t-test:
Example code includes:
# --- t-distribution: tail probabilities, quantiles, t-intervals and t-tests ---
# P(T < 3) for df = 10
(x <- pt(3, df = 10))
## [1] 0.9933282
# P(T > 3) for df = 10
(y <- 1 - pt(3, df=10))
## [1] 0.006671828
# P(T > 3) for df = 100
(z <- 1 - pt(3, df=100))
## [1] 0.001703958
# Comparison — exact `==` on floats is used here purely for illustration
# (heavier tails at low df mean y > z)
y == z
## [1] FALSE
y > z
## [1] TRUE
y < z
## [1] FALSE
# 95th percentile for df = 10
(x <- qt(0.95, df = 10))
## [1] 1.812461
# upper bound of middle 95th percent for df = 10
(y <- qt(0.975, df = 10))
## [1] 2.228139
# upper bound of middle 95th percent for df = 100
(z <- qt(0.975, df = 100))
## [1] 1.983972
# Comparison — critical values shrink toward the normal's 1.96 as df grows
y == z
## [1] FALSE
y > z
## [1] TRUE
y < z
## [1] FALSE
data(acs12, package="openintro")
# Subset for employed respondents
acs12_emp <- acs12 %>%
filter(employment == "employed")
# Construct 95% CI for avg time_to_work
t.test(acs12_emp$time_to_work, conf.level=0.95)
##
## One Sample t-test
##
## data: acs12_emp$time_to_work
## t = 32.635, df = 782, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 24.43369 27.56120
## sample estimates:
## mean of x
## 25.99745
t.test(acs12_emp$hrs_work, conf.level=0.95)
##
## One Sample t-test
##
## data: acs12_emp$hrs_work
## t = 87.521, df = 842, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 38.05811 39.80429
## sample estimates:
## mean of x
## 38.9312
data(textbooks, package="openintro")
# Paired data: CIs widen as the confidence level increases (90% -> 95% -> 99%)
# 90% CI
t.test(textbooks$diff, conf.level = 0.9)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 90 percent confidence interval:
## 9.981505 15.541783
## sample estimates:
## mean of x
## 12.76164
# 95% CI
t.test(textbooks$diff, conf.level = 0.95)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 9.435636 16.087652
## sample estimates:
## mean of x
## 12.76164
# 99% CI
t.test(textbooks$diff, conf.level = 0.99)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 99 percent confidence interval:
## 8.347154 17.176133
## sample estimates:
## mean of x
## 12.76164
# Conduct HT
t.test(textbooks$diff, mu=0, alternative="two.sided", conf.level=0.95)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 9.435636 16.087652
## sample estimates:
## mean of x
## 12.76164
# Calculate 15000 bootstrap means
textdiff_med_ci <- textbooks %>%
specify(response = diff) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat = "median")
# Calculate the 95% CI via percentile method
textdiff_med_ci %>%
summarize(l=quantile(stat, 0.025),
u=quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 5.04 11.7
data(hsb2, package="openintro")
# Calculate diff
hsb2 <- hsb2 %>%
mutate(diff = math - science)
# Generate 15000 bootstrap means centered at null
scorediff_med_ht <- hsb2 %>%
specify(response=diff) %>%
hypothesize(null="point", mu=0) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="median")
# Calculate observed median of differences
scorediff_med_obs <- hsb2 %>%
summarize(median(diff)) %>%
pull()
# Calculate p-value (two-sided, hence the factor of 2)
scorediff_med_ht %>%
filter(stat > scorediff_med_obs) %>%
summarize(p_val = (n() / 15000) * 2)
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0.529
Chapter 3 - Inference for Difference in Two Parameters
Hypothesis testing for comparing two means:
Bootstrap CI for difference in two means:
Comparing means with a t-test:
Example code includes:
# --- Randomization test: stem-cell treatment vs control (infer pipeline) ---
data(stem.cell, package="openintro")
str(stem.cell)
## 'data.frame': 18 obs. of 3 variables:
## $ trmt : Factor w/ 2 levels "ctrl","esc": 1 1 1 1 1 1 1 1 1 2 ...
## $ before: num 35.2 36.5 39.8 39.8 41.8 ...
## $ after : num 29.5 29.5 36.2 38 37.5 ...
# Calculate difference between before and after
stem.cell <- stem.cell %>%
mutate(change = after - before)
# Calculate observed difference in means
diff_mean <- stem.cell %>%
# Group by treatment group
group_by(trmt) %>%
# Calculate mean change for each group
summarize(mean_change = mean(change)) %>%
# Extract
pull() %>%
# Calculate difference
diff()
# Generate 1000 differences in means via randomization
diff_ht_mean <- stem.cell %>%
# y ~ x
specify(change ~ trmt) %>%
# Null = no difference between means
hypothesize(null = "independence") %>%
# Shuffle labels 1000 times
generate(reps = 1000, type = "permute") %>%
# Calculate test statistic
# rev(levels(...)) reverses c("ctrl", "esc"), so the statistic is mean(esc) - mean(ctrl)
calculate(stat = "diff in means", order=rev(levels(stem.cell$trmt)))
# Calculate p-value
diff_ht_mean %>%
# Identify simulated test statistics at least as extreme as observed
filter(stat > diff_mean) %>%
# Calculate p-value
# One-tail proportion only (not doubled, unlike the two-sided test below)
summarize(p_val = (n() / 1000))
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0
# --- Randomization test: birth weight by smoking habit (ncbirths) ---
# Remove subjects with missing habit
ncbirths_complete_habit <- ncbirths %>%
filter(!is.na(habit))
# Calculate observed difference in means
diff_mean <- ncbirths_complete_habit %>%
# Group by habit group
group_by(habit) %>%
# Calculate mean weight for each group
summarize(mean_weight = mean(weight)) %>%
# Extract
pull() %>%
# Calculate difference
diff()
# Generate 1000 differences in means via randomization
diff_ht_mean <- ncbirths_complete_habit %>%
# y ~ x
specify(weight ~ habit) %>%
# Null = no difference between means
hypothesize(null = "independence") %>%
# Shuffle labels 1000 times
generate(reps = 1000, type = "permute") %>%
# Calculate test statistic
calculate(stat = "diff in means", order=rev(levels(ncbirths_complete_habit$habit)))
# Calculate p-value
diff_ht_mean %>%
# Identify simulated test statistics at least as extreme as observed
filter(stat < diff_mean) %>%
# Calculate p-value
# Two-sided p-value: one-tail proportion doubled
summarize(p_val = (n()/1000) * 2)
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0.0280
# --- Bootstrap CIs for the difference in two parameters ---
# Generate 1500 bootstrap difference in means
diff_mean_ci <- ncbirths_complete_habit %>%
specify(weight ~ habit) %>%
generate(reps = 1500, type = "bootstrap") %>%
calculate(stat = "diff in means", order=rev(levels(ncbirths_complete_habit$habit)))
# Calculate the 95% CI via percentile method
diff_mean_ci %>%
summarize(l=quantile(stat, 0.025),
u=quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 -0.583 -0.0530
# Remove subjects with missing habit and weeks
ncbirths_complete_habit_weeks <- ncbirths %>%
filter(!is.na(habit) & !is.na(weeks))
# Generate 1500 bootstrap difference in medians
diff_med_ci <- ncbirths_complete_habit_weeks %>%
specify(weeks ~ habit) %>%
generate(reps = 1500, type = "bootstrap") %>%
calculate(stat="diff in medians", order=rev(levels(ncbirths_complete_habit_weeks$habit)))
# Calculate the 92% CI via percentile method
diff_med_ci %>%
summarize(l=quantile(stat, 0.04),
u=quantile(stat, 0.96))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 -1.00 0
# --- Welch two-sample t-test: hourly pay by citizenship status (acs12) ---
# Create hrly_pay and filter for non-missing hrly_pay and citizen
acs12_complete_hrlypay_citizen <- acs12 %>%
mutate(hrly_pay = income / (hrs_work * 52)) %>%
filter(
!is.na(hrly_pay),
!is.na(citizen)
)
# Calculate percent missing
new_n <- nrow(acs12_complete_hrlypay_citizen)
old_n <- nrow(acs12)
(perc_missing <- (old_n - new_n) / old_n)
## [1] 0.5205
# Calculate summary statistics
acs12_complete_hrlypay_citizen %>%
group_by(citizen) %>%
summarize(
x_bar = mean(hrly_pay),
s = sd(hrly_pay),
n = n()
)
## # A tibble: 2 x 4
## citizen x_bar s n
## <fct> <dbl> <dbl> <int>
## 1 no 21.2 34.5 58
## 2 yes 18.5 24.7 901
# Plot the distributions
ggplot(data = acs12_complete_hrlypay_citizen, mapping = aes(x = hrly_pay)) +
geom_histogram(binwidth = 5) +
facet_grid(. ~ citizen, labeller = labeller(citizen = c(no = "Non citizen",
yes = "Citizen")))
# Construct 95% CI and test H0: difference in means = 0
# FIX: t.test() has no `null` argument -- the original `null=0` was silently
# swallowed by `...` and only gave the right answer because `mu` defaults to 0.
# Spell out `mu=0` so the null value is actually passed to the test.
t.test(hrly_pay ~ citizen, data=acs12_complete_hrlypay_citizen, mu=0, alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: hrly_pay by citizen
## t = 0.58058, df = 60.827, p-value = 0.5637
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.53483 11.88170
## sample estimates:
## mean in group no mean in group yes
## 21.19494 18.52151
Chapter 4 - Comparing Many Means
Do vocabulary scores vary between social classes?
ANOVA - Analysis of Variance:
Conditions for ANOVA:
Post-hoc testing - determining which of the means are different:
Wrap-up:
Example code includes:
# --- One-way ANOVA: vocabulary score (wordsum) across social class (gss) ---
gss <- readr::read_csv("./RInputFiles/gss_wordsum_class.csv")
## Parsed with column specification:
## cols(
## wordsum = col_integer(),
## class = col_character()
## )
str(gss)
## Classes 'tbl_df', 'tbl' and 'data.frame': 795 obs. of 2 variables:
## $ wordsum: int 6 9 6 5 6 6 8 10 8 9 ...
## $ class : chr "MIDDLE" "WORKING" "WORKING" "WORKING" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ wordsum: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ class : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Histograms of wordsum, one panel per class
ggplot(gss, aes(x=wordsum)) +
geom_histogram(binwidth=1) +
facet_grid(class ~ .)
# Fit the ANOVA model and tidy the F-table with broom
aov_wordsum_class <- aov(wordsum ~ class, data=gss)
broom::tidy(aov_wordsum_class)
## term df sumsq meansq statistic p.value
## 1 class 3 236.5644 78.854810 21.73467 1.560565e-13
## 2 Residuals 791 2869.8003 3.628066 NA NA
# Standard deviation of wordsum within each class
gss %>%
group_by(class) %>%
summarize(s = sd(wordsum))
## # A tibble: 4 x 2
## class s
## <chr> <dbl>
## 1 LOWER 2.24
## 2 MIDDLE 1.89
## 3 UPPER 2.34
## 4 WORKING 1.87
# Conduct the pairwise.t.test with p.adjust = "none" option (we'll adjust the significance level, not the p-value). The first argument is the response vector and the second argument is the grouping vector.
pairwise.t.test(gss$wordsum, gss$class, p.adjust = "none") %>%
broom::tidy()
## group1 group2 p.value
## 1 MIDDLE LOWER 1.133345e-07
## 2 UPPER LOWER 4.752521e-02
## 3 WORKING LOWER 3.055619e-02
## 5 UPPER MIDDLE 2.395734e-01
## 6 WORKING MIDDLE 1.631637e-12
## 9 WORKING UPPER 3.670775e-01
Chapter 1 - Introduction to Correlation Coefficients
How are correlation coefficients calculated?
Usefulness of correlation coefficients:
Points of caution:
Example code includes:
# Cache the physical-endurance dataset up front (the original read further below is commented out)
PE <- read.table("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.04.txt", header=TRUE)
# Hand-rolled Pearson correlation, cross-checked against the built-in cor().
# Two short example vectors
A <- c(1, 2, 3)
print(A)
## [1] 1 2 3
B <- c(3, 6, 7)
print(B)
## [1] 3 6 7
# Deviations of each element from its vector's mean
dev_a <- A - mean(A)
dev_b <- B - mean(B)
n <- length(A)
# Sample covariance: sum of cross-products divided by n - 1
covariance_ab <- sum(dev_a * dev_b) / (n - 1)
# Sample standard deviations from the squared deviations
s_a <- sqrt(sum(dev_a^2) / (n - 1))
s_b <- sqrt(sum(dev_b^2) / (n - 1))
# Pearson's r: covariance rescaled by both standard deviations
correlation <- covariance_ab / (s_a * s_b)
print(correlation)
## [1] 0.9607689
# Check the validity of the result with the cor() command
print(cor(A, B))
## [1] 0.9607689
# Read data from a URL into a dataframe called PE (physical endurance) - moved above to cache
# PE <- read.table("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.04.txt", header=TRUE)
# Summary statistics (psych::describe reports n, mean, sd, skew, etc. per column)
psych::describe(PE)
## vars n mean sd median trimmed mad min max range skew
## pid 1 200 101.81 58.85 101.5 101.71 74.87 1 204 203 0.01
## age 2 200 49.41 10.48 48.0 49.46 10.38 20 82 62 0.06
## activeyears 3 200 10.68 4.69 11.0 10.57 4.45 0 26 26 0.30
## endurance 4 200 26.50 10.84 27.0 26.22 10.38 3 55 52 0.22
## kurtosis se
## pid -1.21 4.16
## age -0.14 0.74
## activeyears 0.46 0.33
## endurance -0.44 0.77
# Scatter plots
plot(PE$age ~ PE$activeyears)
plot(PE$endurance ~ PE$activeyears)
plot(PE$endurance ~ PE$age)
# Correlation Analysis
# Drop the id column (pid) before computing the correlation matrix
round(cor(PE[, !(names(PE) == "pid")]), 2)
## age activeyears endurance
## age 1.00 0.33 -0.08
## activeyears 0.33 1.00 0.33
## endurance -0.08 0.33 1.00
# Do some correlation tests. If the null hypothesis of no correlation can be rejected on a significance level of 5%, then the relationship between variables is significantly different from zero at the 95% confidence level
cor.test(PE$age, PE$activeyears)
##
## Pearson's product-moment correlation
##
## data: PE$age and PE$activeyears
## t = 4.9022, df = 198, p-value = 1.969e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1993491 0.4473145
## sample estimates:
## cor
## 0.3289909
cor.test(PE$endurance, PE$activeyears)
##
## Pearson's product-moment correlation
##
## data: PE$endurance and PE$activeyears
## t = 4.8613, df = 198, p-value = 2.37e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1967110 0.4451154
## sample estimates:
## cor
## 0.3265402
# This last pair (endurance vs age) is NOT significant at the 5% level
cor.test(PE$endurance, PE$age)
##
## Pearson's product-moment correlation
##
## data: PE$endurance and PE$age
## t = -1.1981, df = 198, p-value = 0.2323
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.22097811 0.05454491
## sample estimates:
## cor
## -0.08483813
# The impact dataset is already loaded in
rawImpactData <- " 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, 95, 90, 87, 84, 92, 89, 78, 97, 93, 90, 89, 97, 79, 86, 85, 85, 98, 95, 96, 92, 79, 85, 97, 89, 75, 75, 84, 93, 88, 97, 93, 96, 84, 89, 95, 95, 97, 95, 92, 95, 88, 82, 77, 72, 77, 79, 63, 82, 85, 66, 76, 79, 60, 59, 60, 76, 85, 83, 67, 84, 81, 85, 91, 74, 63, 68, 78, 74, 80, 73, 74, 70, 81, 72, 90, 74, 70, 63, 65, 69, 35.29, 31.47, 30.87, 41.87, 33.28, 40.73, 38.09, 31.65, 39.59, 30.53, 33.65, 37.51, 40.39, 32.88, 33.39, 35.13, 38.51, 29.64, 35.32, 27.36, 27.19, 32.66, 26.29, 28.92, 32.77, 32.92, 34.26, 36.08, 31.63, 28.89, 35.81, 33.61, 34.46, 39.18, 33.14, 33.03, 39.01, 35.06, 30.58, 38.45, 0.42, 0.63, 0.56, 0.66, 0.56, 0.81, 0.66, 0.79, 0.68, 0.60, 0.74, 0.51, 0.82, 0.59, 0.82, 0.63, 0.73, 0.57, 0.65, 1.00, 0.57, 0.71, 0.82, 0.61, 0.72, 0.50, 0.54, 0.65, 0.66, 0.71, 0.55, 0.79, 0.48, 0.55, 1.20, 0.73, 0.60, 0.84, 0.60, 0.42, 11, 7, 8, 7, 7, 6, 6, 10, 7, 10, 7, 7, 12, 2, 9, 10, 10, 8, 5, 11, 7, 9, 9, 9, 8, 9, 6, 10, 9, 7, 9, 7, 7, 10, 10, 11, 10, 5, 8, 11, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 97, 86, 90, 85, 87, 91, 90, 94, 91, 93, 92, 89, 84, 81, 85, 87, 96, 93, 95, 93, 63, 79, 91, 85, 74, 72, 80, 59, 75, 90, 66, 85, 72, 82, 80, 59, 74, 62, 67, 66, 86, 80, 79, 70, 77, 85, 60, 72, 83, 68, 72, 79, 67, 71, 61, 72, 78, 85, 67, 80, 75, 79, 80, 72, 56, 66, 74, 69, 79, 73, 69, 61, 79, 66, 80, 70, 62, 54, 57, 63, 35.61, 37.01, 
20.15, 33.26, 28.34, 33.47, 44.28, 36.14, 37.42, 25.19, 23.63, 26.32, 43.70, 32.40, 39.32, 35.62, 39.95, 35.62, 30.21, 30.37, 29.23, 44.45, 26.12, 27.98, 60.77, 31.91, 49.62, 35.68, 55.67, 25.70, 35.21, 33.01, 37.46, 53.20, 33.20, 34.59, 39.66, 35.09, 32.30, 44.49, 0.65, 0.49, 0.75, 0.19, 0.59, 0.48, 0.77, 0.90, 0.65, 0.59, 0.55, 0.56, 0.57, 0.69, 0.73, 0.48, 0.43, 0.37, 0.47, 0.50, 0.61, 0.65, 1.12, 0.65, 0.71, 0.79, 0.64, 0.70, 0.68, 0.73, 0.58, 0.97, 0.56, 0.51, 1.30, 0.70, 0.74, 1.24, 0.65, 0.98, 10, 7, 9, 8, 8, 5, 6, 10, 8, 11, 9, 9, 10, 3, 10, 12, 10, 9, 5, 11, 3, 6, 5, 5, 1, 9, 7, 11, 6, 3, 4, 3, 1, 7, 7, 4, 5, 2, 6, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 26, 34, 27, 22, 26, 35, 43, 31, 39, 25, 31, 38, 14, 16, 33, 13, 27, 15, 19, 39"
# Rebuild the `impact` data frame from the raw comma-separated string above.
# The values appear one whole column at a time (40 subjects per variable), so
# matrix() with ncol = 14, which fills column-major, assigns 40 values per column.
rawImpactNames <- c('subject', 'condition', 'vermem1', 'vismem1', 'vms1', 'rt1', 'ic1', 'sym1', 'vermem2', 'vismem2', 'vms2', 'rt2', 'ic2', 'sym2')
splitImpactData <- stringr::str_split(rawImpactData, ",")
impactRawMatrix <- matrix(data=splitImpactData[[1]], ncol=length(rawImpactNames))
colnames(impactRawMatrix) <- rawImpactNames
rawImpactDF <- as.data.frame(impactRawMatrix, stringsAsFactors=FALSE)
# Coerce every column to numeric EXCEPT `condition` (column 2), which stays character for now
for (intCtr in c(1, 3:ncol(rawImpactDF))) { rawImpactDF[, intCtr] <- as.numeric(rawImpactDF[, intCtr]) }
# Strip the padding spaces from the condition labels, then convert to a factor
rawImpactDF$condition <- factor(stringr::str_replace_all(rawImpactDF$condition, " ", ""))
impact <- rawImpactDF
# --- Summary statistics and subset correlations on the ImPACT concussion data ---
# Summary statistics entire dataset
psych::describe(impact)
## vars n mean sd median trimmed mad min max range
## subject 1 40 20.50 11.69 20.50 20.50 14.83 1.00 40.00 39.00
## condition* 2 40 1.50 0.51 1.50 1.50 0.74 1.00 2.00 1.00
## vermem1 3 40 89.75 6.44 91.00 90.44 6.67 75.00 98.00 23.00
## vismem1 4 40 74.88 8.60 75.00 74.97 9.64 59.00 91.00 32.00
## vms1 5 40 34.03 3.90 33.50 34.02 3.62 26.29 41.87 15.58
## rt1 6 40 0.67 0.15 0.65 0.66 0.13 0.42 1.20 0.78
## ic1 7 40 8.28 2.05 8.50 8.38 2.22 2.00 12.00 10.00
## sym1 8 40 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00
## vermem2 9 40 82.00 11.02 85.00 82.97 9.64 59.00 97.00 38.00
## vismem2 10 40 71.90 8.42 72.00 72.19 10.38 54.00 86.00 32.00
## vms2 11 40 35.83 8.66 35.15 34.98 6.89 20.15 60.77 40.62
## rt2 12 40 0.67 0.22 0.65 0.65 0.13 0.19 1.30 1.11
## ic2 13 40 6.75 2.98 7.00 6.81 2.97 1.00 12.00 11.00
## sym2 14 40 13.88 15.32 7.00 12.38 10.38 0.00 43.00 43.00
## skew kurtosis se
## subject 0.00 -1.29 1.85
## condition* 0.00 -2.05 0.08
## vermem1 -0.70 -0.51 1.02
## vismem1 -0.11 -0.96 1.36
## vms1 0.08 -0.75 0.62
## rt1 1.14 2.21 0.02
## ic1 -0.57 0.36 0.32
## sym1 3.98 14.16 0.03
## vermem2 -0.65 -0.81 1.74
## vismem2 -0.28 -0.87 1.33
## vms2 0.86 0.65 1.37
## rt2 0.93 1.29 0.03
## ic2 -0.16 -1.06 0.47
## sym2 0.44 -1.47 2.42
# Calculate correlation coefficient (vismem2 vs vermem2, whole sample)
entirecorr <- round(cor(impact$vismem2, impact$vermem2), 2)
# Summary statistics subsets (one block per level of condition)
psych::describeBy(impact, impact$condition)
##
## Descriptive statistics by group
## group: concussed
## vars n mean sd median trimmed mad min max range
## subject 1 20 30.50 5.92 30.50 30.50 7.41 21.00 40.00 19.00
## condition* 2 20 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00
## vermem1 3 20 89.65 7.17 92.50 90.56 5.93 75.00 97.00 22.00
## vismem1 4 20 74.75 8.03 74.00 74.25 8.15 63.00 91.00 28.00
## vms1 5 20 33.20 3.62 33.09 33.27 3.32 26.29 39.18 12.89
## rt1 6 20 0.66 0.17 0.63 0.64 0.13 0.42 1.20 0.78
## ic1 7 20 8.55 1.64 9.00 8.62 1.48 5.00 11.00 6.00
## sym1 8 20 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00
## vermem2 9 20 74.05 9.86 74.00 73.88 11.86 59.00 91.00 32.00
## vismem2 10 20 69.20 8.38 69.50 69.62 10.38 54.00 80.00 26.00
## vms2 11 20 38.27 10.01 35.15 37.32 7.73 25.70 60.77 35.07
## rt2 12 20 0.78 0.23 0.70 0.74 0.11 0.51 1.30 0.79
## ic2 13 20 5.00 2.53 5.00 4.88 2.97 1.00 11.00 10.00
## sym2 14 20 27.65 9.07 27.00 27.75 11.12 13.00 43.00 30.00
## skew kurtosis se
## subject 0.00 -1.38 1.32
## condition* NaN NaN 0.00
## vermem1 -0.79 -0.70 1.60
## vismem1 0.45 -0.72 1.80
## vms1 -0.13 -0.78 0.81
## rt1 1.38 2.41 0.04
## ic1 -0.39 -0.81 0.37
## sym1 3.82 13.29 0.05
## vermem2 0.07 -1.24 2.21
## vismem2 -0.27 -1.26 1.87
## vms2 0.77 -0.57 2.24
## rt2 1.09 -0.10 0.05
## ic2 0.39 -0.28 0.57
## sym2 -0.11 -1.25 2.03
## --------------------------------------------------------
## group: control
## vars n mean sd median trimmed mad min max range skew
## subject 1 20 10.50 5.92 10.50 10.50 7.41 1.00 20.00 19.00 0.00
## condition* 2 20 2.00 0.00 2.00 2.00 0.00 2.00 2.00 0.00 NaN
## vermem1 3 20 89.85 5.82 90.00 90.31 7.41 78.00 98.00 20.00 -0.41
## vismem1 4 20 75.00 9.34 77.00 75.50 9.64 59.00 88.00 29.00 -0.46
## vms1 5 20 34.86 4.09 34.39 34.85 4.92 27.36 41.87 14.51 0.09
## rt1 6 20 0.67 0.13 0.66 0.67 0.13 0.42 1.00 0.58 0.47
## ic1 7 20 8.00 2.41 7.50 8.12 2.22 2.00 12.00 10.00 -0.41
## sym1 8 20 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00 3.82
## vermem2 9 20 89.95 4.36 90.50 90.06 5.19 81.00 97.00 16.00 -0.25
## vismem2 10 20 74.60 7.76 74.50 75.00 8.15 60.00 86.00 26.00 -0.23
## vms2 11 20 33.40 6.44 34.54 33.52 6.30 20.15 44.28 24.13 -0.25
## rt2 12 20 0.57 0.16 0.56 0.57 0.13 0.19 0.90 0.71 -0.16
## ic2 13 20 8.50 2.31 9.00 8.69 1.48 3.00 12.00 9.00 -0.73
## sym2 14 20 0.10 0.31 0.00 0.00 0.00 0.00 1.00 1.00 2.47
## kurtosis se
## subject -1.38 1.32
## condition* NaN 0.00
## vermem1 -0.87 1.30
## vismem1 -1.27 2.09
## vms1 -1.19 0.91
## rt1 -0.02 0.03
## ic1 -0.17 0.54
## sym1 13.29 0.05
## vermem2 -1.02 0.97
## vismem2 -1.11 1.73
## vms2 -0.77 1.44
## rt2 0.06 0.04
## ic2 -0.32 0.52
## sym2 4.32 0.07
# Create 2 subsets: control and concussed
control <- subset(impact, condition == "control")
concussed <- subset(impact, condition == "concussed")
# Calculate correlation coefficients for each subset
controlcorr <- round(cor(control$vismem2, control$vermem2), 2)
concussedcorr <- round(cor(concussed$vismem2, concussed$vermem2), 2)
# Display all values at the same time (one-row matrix via cbind)
correlations <- cbind(entirecorr, controlcorr, concussedcorr)
correlations
## entirecorr controlcorr concussedcorr
## [1,] 0.45 0.37 0.35
Chapter 2 - Introduction to Linear Regression
Introduction to regression:
Regression equations and the R-squared value:
Multiple linear regression:
Example code includes:
# Look at the dataset. Note that the variables we are interested in are on the 9th to 14th columns
head(impact)
## subject condition vermem1 vismem1 vms1 rt1 ic1 sym1 vermem2 vismem2
## 1 1 control 95 88 35.29 0.42 11 0 97 86
## 2 2 control 90 82 31.47 0.63 7 0 86 80
## 3 3 control 87 77 30.87 0.56 8 0 90 79
## 4 4 control 84 72 41.87 0.66 7 0 85 70
## 5 5 control 92 77 33.28 0.56 7 1 87 77
## 6 6 control 89 79 40.73 0.81 6 0 91 85
## vms2 rt2 ic2 sym2
## 1 35.61 0.65 10 0
## 2 37.01 0.49 7 0
## 3 20.15 0.75 9 0
## 4 33.26 0.19 8 0
## 5 28.34 0.59 8 1
## 6 33.47 0.48 5 0
# Create a correlation matrix for the dataset (post-injury columns only)
correlations <- cor(impact[, 9:14])
# Create the scatterplot matrix for the dataset
corrplot::corrplot(correlations)
# Calculate the required means, standard deviations and correlation coefficient
mean_sym2 <- mean(impact$sym2)
mean_ic2 <- mean(impact$ic2)
sd_sym2 <- sd(impact$sym2)
sd_ic2 <- sd(impact$ic2)
r <- cor(impact$ic2,impact$sym2)
# Calculate the slope by hand: b1 = r * sd(y) / sd(x)
B_1 <- r * ( sd_sym2 )/( sd_ic2 )
# Calculate the intercept by hand: b0 = mean(y) - b1 * mean(x)
B_0 <- mean_sym2 - B_1 * mean_ic2
# Plot of ic2 against sym2
plot(x=impact$ic2, y=impact$sym2, main = "Scatterplot", ylab = "Symptoms", xlab = "Impulse Control")
# Add the regression line (hand-computed; matches the lm() fit below)
abline(B_0, B_1, col = "red")
# Construct the regression model
model_1 <- lm(impact$sym2 ~ impact$ic2)
# Look at the results of the regression by using the summary function
summary(model_1)
##
## Call:
## lm(formula = impact$sym2 ~ impact$ic2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.441 -8.983 -5.309 9.127 29.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.2945 5.5090 5.318 4.9e-06 ***
## impact$ic2 -2.2844 0.7483 -3.053 0.00413 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.91 on 38 degrees of freedom
## Multiple R-squared: 0.1969, Adjusted R-squared: 0.1758
## F-statistic: 9.319 on 1 and 38 DF, p-value: 0.004125
# Create a scatter plot of Impulse Control against Symptom Score
plot(impact$sym2 ~ impact$ic2, main = "Scatterplot", ylab = "Symptoms", xlab = "Impulse Control")
# Add a regression line
abline(model_1, col = "red")
# Multiple Regression (two predictors: impulse control and verbal memory)
model_2 <- lm(impact$sym2 ~ impact$ic2 + impact$vermem2)
# Examine the results of the regression
summary(model_2)
##
## Call:
## lm(formula = impact$sym2 ~ impact$ic2 + impact$vermem2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.274 -8.031 -2.703 6.245 27.962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.7639 14.7765 5.398 4.1e-06 ***
## impact$ic2 -1.0711 0.7335 -1.460 0.152690
## impact$vermem2 -0.7154 0.1981 -3.611 0.000898 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.12 on 37 degrees of freedom
## Multiple R-squared: 0.4062, Adjusted R-squared: 0.3742
## F-statistic: 12.66 on 2 and 37 DF, p-value: 6.482e-05
# Extract the predicted values
predicted <- fitted(model_2)
# Plotting predicted scores against observed scores
plot(predicted ~ impact$sym2, main = "Scatterplot", xlab = "Observed Scores", ylab = "Predicted Scores")
abline(lm(predicted ~ impact$sym2), col = "green")
Chapter 3 - Linear Regression Models (cont)
Estimation of coefficients - key concept is to minimize the sum of squared residuals:
Estimation of standardized and unstandardized regression coefficients:
Assumptions of linear regression:
Anscombe’s quartet:
Example code includes:
# Create a linear regression with `ic2` and `vismem2` as regressors
model_1 <- lm(impact$sym2 ~ impact$ic2 + impact$vismem2)
# Extract the predicted values
predicted_1 <- fitted(model_1)
# Calculate the squared deviation of the predicted values from the observed values
deviation_1 <- (impact$sym2 - predicted_1) ** 2
# Sum the squared deviations
SSR_1 <- sum(deviation_1)
SSR_1
## [1] 7236.338
# Create a linear regression with `ic2` and `vermem2` as regressors
model_2 <- lm(impact$sym2 ~ impact$ic2 + impact$vermem2)
# Extract the predicted values
predicted_2 <- fitted(model_2)
# Calculate the squared deviation of the predicted values from the observed values
deviation_2 <- (impact$sym2 - predicted_2) ** 2
# Sum the squared deviations
SSR_2 <- sum(deviation_2)
SSR_2
## [1] 5435.454
# Standardized (z-scored) simple regression: with both variables scaled,
# the slope equals the correlation between `sym2` and `ic2`.
# Create a standardized simple linear regression
model_1_z <- lm(scale(impact$sym2) ~ scale(impact$ic2))
# Look at the output of this regression model
summary(model_1_z)
##
## Call:
## lm(formula = scale(impact$sym2) ~ scale(impact$ic2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4648 -0.5863 -0.3465 0.5958 1.9383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.487e-16 1.435e-01 0.000 1.00000
## scale(impact$ic2) -4.438e-01 1.454e-01 -3.053 0.00413 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9078 on 38 degrees of freedom
## Multiple R-squared: 0.1969, Adjusted R-squared: 0.1758
## F-statistic: 9.319 on 1 and 38 DF, p-value: 0.004125
# Extract the R-Squared value for this regression
# (the component is named `r.squared`; `$r.square` only worked via `$`
# partial matching, a known footgun — spelled out fully here, consistent
# with the multiple-regression code later in this script)
r_square_1 <- summary(model_1_z)$r.squared
# Calculate the correlation coefficient
corr_coef_1 <- sqrt(r_square_1)
# Standardized multiple regression: slopes are standardized (beta) coefficients.
# Create a standardized multiple linear regression
model_2_z <- lm(scale(impact$sym2) ~ scale(impact$ic2) + scale(impact$vismem2))
# Look at the output of this regression model
summary(model_2_z)
##
## Call:
## lm(formula = scale(impact$sym2) ~ scale(impact$ic2) + scale(impact$vismem2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4349 -0.5949 -0.3174 0.5331 1.9646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.450e-16 1.443e-01 0.000 1.0000
## scale(impact$ic2) -4.101e-01 1.526e-01 -2.688 0.0107 *
## scale(impact$vismem2) -1.171e-01 1.526e-01 -0.767 0.4479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9128 on 37 degrees of freedom
## Multiple R-squared: 0.2095, Adjusted R-squared: 0.1668
## F-statistic: 4.904 on 2 and 37 DF, p-value: 0.01291
# Extract the R-Squared value for this regression
r_square_2 <- summary(model_2_z)$r.squared
# Calculate the correlation coefficient
# (sqrt of R-squared from a multiple regression is the multiple correlation R)
corr_coef_2 <- sqrt(r_square_2)
# Residual diagnostics for model_2 (the ic2 + vermem2 model fitted earlier).
# Extract the residuals from the model
residual <- resid(model_2)
# Draw a histogram of the residuals
hist(residual)
# Extract the predicted symptom scores from the model
predicted <- fitted(model_2)
# Plot the residuals against the predicted symptom scores
plot(residual ~ predicted, main = "Scatterplot", xlab="Model 2 Predicted Scores", ylab="Model 2 Residuals" )
# Reference line: by least-squares construction this fit is flat at zero,
# so visible pattern around it signals assumption violations.
abline(lm(residual ~ predicted), col="red")
Chapter 1 - Inferential Ideas
Variability in regression lines:
Research question - linear modeling for relationships between fat, carbohydrates, and calories in Starbucks food:
Variability of coefficients:
Example code includes:
# Rail-trail example: regress daily ride volume on the day's high temperature.
# Load the mosaicData package and the RailTrail data
library(mosaicData)
data(RailTrail)
# Fit a linear model
ride_lm <- lm(volume ~ hightemp, data=RailTrail)
# View the summary of your model
summary(ride_lm)
##
## Call:
## lm(formula = volume ~ hightemp, data = RailTrail)
##
## Residuals:
## Min 1Q Median 3Q Max
## -254.562 -57.800 8.737 57.352 314.035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.079 59.395 -0.288 0.774
## hightemp 5.702 0.848 6.724 1.71e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104.2 on 88 degrees of freedom
## Multiple R-squared: 0.3394, Adjusted R-squared: 0.3319
## F-statistic: 45.21 on 1 and 88 DF, p-value: 1.705e-09
# Print the tidy model output
# (broom::tidy gives the coefficient table as a data frame, one row per term)
ride_lm %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -17.079281 59.3953040 -0.2875527 7.743652e-01
## 2 hightemp 5.701878 0.8480074 6.7238541 1.705138e-09
expData1 <- c(-4.3, 0.19, -2.59, -0.43, 0.59, -2.74, 3.09, 3.51, 0.56, 5.89, 0.36, -0.01, 2.59, 1.51, 2.89, -8.26, -0.46, 3.28, 4.85, 1.16, 3.03, 2.24, 1.78, -0.26, 4.29, 6.92, -6.34, 0.49, 3.4, 3.08, 2.1, -1.93, 3.72, 0.52, -4.65, 4.24, -1.21, 5.15, -10.43, 6.46, -2.78, 0.7, 2.93, -4.84, -7.08, -3.98, 8.27, -4.51, -5.22, -2.17, 2.32, 0.37, -2.53, 3.2, -8.02, -1.82, -6.17, 1.45, -0.19, -0.91, -2.02, 1.13, 11.2, 4.43, 0.88, -0.28, -9.29, 0.18, -6.9, 0.44, -9.1, -1.21, 11.32, -3.3, 3.56, 1.28, 5.76, -2.73, -9.69, -4.43, 5.71, 1.09, -8.28, -7.12, -0.33, -4.3, 4.16, 4.83, -0.29, -3.78, 5.03, 12.3, 4.79, 0.69, -11.06, 3.73, -6.64, -0.24, 5.08, -0.48, 0.68, 4.43, 2.11, 1.8, 2.98, -4.84, -3.9, 4.1, 0.05, -7.43, -2.41, 1.14, -1.87, 11.12, 6.26, 1.29, -4.54, 5.38, 3.09, -4.59, 8.55, -4.21, -0.92, 0.79, -3.48, -6.13, 3.58, 4.54, -4.83, -13.5, 1.58, -1.03, 1.34, -1.46, 5.53, -4.23, -6.95, 6.17, -0.89, 9.95, -4.12, 0.08, 2.49, -8.42, -2.4, -6.96, 7.92, -5.04, -0.25, -0.63, 8.4, 4.18, -4.86, 0.99, -5.54, -4.23, -2.23, 2.21, -0.05, -2.67, -1.14, 3.3, -5.48, 3.86, 2.1, 4.81, -1.09, -10.97, -16.68, -8.58, 3.78, 5.94, 0.35, 0.14, -8.6, -3.44, -5.14, -6.65, -0.49, -1.99, 3.54, 4.7, -0.61, 8.69, 0.91, 0.71, 3.6, -3.1, -2.99, 5.82, 3.84, 0.82, -2.74, -6.27, -3.03, 1.29, 1.58, 1.76, 4.64, -7.24, 1.54, 0.83, -0.6, -0.29, 0.78, -8.42, 9.76, 14.35, -1.09, -13.42, -1.72, 4.49, -0.02, -0.47, 8.93, 5.27, -6.06, 12.66, 0.53, -3.08, 0.52, -0.71, -0.39, -1.11, -1.72, 8.66, -1.41, 2.77, 1.03, -6.97, 7.57, -10.75, -0.88, -2.53, 1.64, 6.48, -1.61, -1.98, -5.91, 7.25, -1.67, 4.26, -7.22, 6.03, 2.92, 4.08, 9.65, -12.34, 1.24, 3.76, 3.25, -9.13, -3.23, 0.51, -1.52, -3.44, 6.75, -0.18, -3.92, -4.14, 1.14, 6.44, -0.32, 5.91, -3.55, -8.99, -6.38, -2.64, -1.47, -3.91, 12.07, 5.55, -7.94, 10.98, -6.57, -3.43, -1.13, 9.51, 11.19, -3.21, -3.19, -7.94, 2.4)
expData2 <- c(-4.59, 6.5, 3.8, -6.42, 0.78, -2.4, -2.55, -3.2, -6.3, 4.69, -0.05, 5.71, 6.5, 3.69, -4.75, 4.87, -2.42, -5.04, 3.75, 1.69, -0.19, 8.33, 2.8, -0.09, 6.24, -3.73, -2.64, 8.11, -4.43, 4.42, 3.46, -6.71, -5.47, 6.84, 4.94, 2.23, 0.92, 1.56, -3.52, -5.42, -1.04, -4.33, -0.63, -1.72, -5.42, -8.92, -4.8, -6.53, 3.33, 3.39, 4.08, -3.03, -5.11, 7.04, -0.93, -2.56, -1.45, 8.75, -4.01, -5.87, 3.36, 5.83, 1.13, -1.25, -0.04, 0.23, 0.95, 3.16, -7.17, 12.37, -9.98, -9.73, 1.55, -8.56, 13.58, 0.56, 6.39, 2.34, -5.11, 6.48, -1.62, -1.16, -6.37, 7.48, 3.51, 4.82, 1.73, -0.48, -0.84, 2.58, -3.24, -1.33, 4.69, -0.99, 9.78, -16.75, -2.92, 10.15, -4.64, 5.66, 0.89, 2.11, 1.66, 3.78, 3.43, -1.09, -1.43, -10.07, -0.87, 4.41, -3.55, -1.66, 8.28, 8.3, 1.03, 6.42, -0.33, -2.63, -4.12, 6.68, -1.32, 10.69, 7.11, -3.75, 1.16, 5.19, -4.41, -4.13, -3.32, -8.24, -3.19, 1.1, 5.45, 2.19, -10.27, -0.87, -1.32, -2.77, 7.39, -14.48, -2.06, -3.46, -4.21, -6.55, -1.59, -0.44, -3.11, -4.21, -8.38, 0.01, 10.58, 3.05, 3.67, -2.52, 2.05, -2, 7.04, -0.42, -12.23, -0.44, -1.66, -1.31, -0.16, 1.72, -3.25, 2.56, -0.21, -1.59, 2.35, -2.5, 0.44, 8.61, 2.83, 10.75, 1.1, -0.89, 4.89, -0.91, 1.83, 3.2, -1.16, -3.23, 0.96, 2.59, 6.36, -0.53, -4.2, -1.13, 2.37, -1.06, -3.69, -0.25, 8.21, -5.84, -5.53, -3.03, -0.79, -0.72, -0.67, 3.23, -6.51, 2.06, -0.4, 0.75, -2.39, -2.27, -3.65, -7.56, 3.24, -4.05, -4.2, -5.91, 5.24, -11.65, -4.16, -5.99, 1.22, 1.32, -3.63, -0.9, -3.52, -5.25, 8.05, 4.09, -3.22, 5.71, 0.67, -5.46, -5.24, -1.7, -6.4, 0.48, 4.49, 15.97, -1.42, 2.41, -1.75, 4.77, -4.45, 0.88, 0.24, 11.64, -0.51, 1.58, 4.18, -3.51, 2.32, -2.15, -5.42, 5.6, 4.18, -4.82, -1.41, -5.32, 0.58, 1.23, -5.35, -5.88, 0.76, -2.81, 0.59, -2.26, 4.05, 0.32, 5.97, 4.22, -1.79, 3.28, -4.16, -4.88, -1.24, -7.38, -2.67, -4.56, 2.45, 4.92, 1.84, -1.6, 4.79, -4.02, -9.2, 6.78, -8.21, -0.18, -4.02, 4.84, 2.81, -2.65, -4.72, -0.83, -4.69, 7.94, 3.53, 4.25, 5.06, 7.88, -1.08, -0.78, 3.41, -10.45, 0.16, 0.13, -0.6, 1.82, 5.68, 
5.7, 4.66, -5.4, 7.12, -2.49, 1.5, 1.27, -8.26, 0.58, 0.04, 3.17, -3.23, -0.66, -3.2, 1.59, -4, -1.96, -3.48, -3.4, -3.95, 4.52, 2.5, -3.37, -14.81, -3.22, -3.57, 2.44, 0.17, 4.8, -6.15, -3.4, -4.1, -2.68, 5.86, 2.92, -0.19, -4.64, 9.4, 6.49, -5.84, -6.62, 2.86, -3.56, -4.6, -4.87, 7.32, 3.82, 8.99, -1.46, 4.98, -1.41, -5.89, -8.86, 6.87, -7.25, 2.67, 2.81, -0.22, -3.37, 6.74, 3.33, 4.72, 1.02, -3.02, -4.9, 2, 3.41, 0.5, -7.36, 6.36, 4, 2.24, -6.75, -5.62, -8.14, 1.82, 6.23, -0.18, 10.71, -0.57, 1.38, 9.5, 1.12, 3.08, 0.08, -4.75, 4.23, -2.23, -0.82, 1.84, -1.15, 4.12, -5.86, -0.16, -6.5, 4.86)
expData3 <- c(-0.62, 1.5, 5.44, -1.68, -10.04, 11.49, 1.48, -1.82, 1.57, 3.06, -2.36, -7.98, 0.25, -1.77, 3.32, 1.72, -7.55, 7.24, 2.78, -4.41, -2.55, -1.3, -1.49, 2.78, -4.37, -4.41, 0.57, 0.7, -0.56, 0.17, -2.52, 0.5, 2.46, -5.55, 2.98, 0.51, -0.28, 3.97, 6.74, 0.14, 3.54, 0.38, -2.69, 1.59, 3.09, -2.73, 4.93, 7.43, 1.76, 0.77, 4.54, 3.69, 5.75, -2.68, -1.01, 6.47, 1.91, -3.48, -2.91, 3.62, -3.72, 2.09, 0.63, -6.95, -0.66, -8.25, 6.6, -3.02, 3.51, 11.77, -1.78, -1.57, 5.58, -0.44, 3.07, -2.54, -3.1, 3.77, 8.05, -2.44, -0.95, 3.73, 1.64, 7.64, 3.63, 3.39, 1.71, -6.25, -3.47, 1.6, 3.49, 0.94, 0.18, -4.29, -2.62, 14.57, -1.73, 1.79, 2.54, -2.94, -0.56, 6.87, -4.81, 6.45, 4.2, 1.65, 8.4, 7.45, 7.11, 5.56, 1.06, -8.52, -7.68, -6.63, -4.09, 0.16, -6.08, -5.78, -4.46, -1.35, 3.34, -0.51, -3.65, -3.82, 0.64, 8.2, 14.07, -0.87, 3.3, 1.7, -3.17, -0.57, -1.06, 5.74, 0.79, -5.42, -2.22, 3.72, 2.88, -5.73, 0.82, -3.04, 6.11, 7.04, 2.84, 0.29, -2.37, 4.49, -5, -4.09, 0.33, 0.34, 0.81, 2.11, -1.55, -0.75, -7.49, 6.03, 0.14, 3.58, -0.67, 7.74, 5.55, 5.44, -8.21, 8.48, 2.15, 0.04, -3.68, 6.09, 4.06, 2.85, 2.47, -1.37, 3.66, 0.63, 0.46, -1.82, -7.6, 0.05, -3.03, -7.56, 1.56, 2.44, -2.56, -9.01, -0.19, -5.88, -7.51, -5.84, 3.79, -18, 5.33, -4.15, -6.26, 0.53, 15.21, 4.85, 1.98, -1.25, -1.12, -5.65, -0.96, 11.19, 2.76, -2.89, 0.49, -1.83, -2.52, -1.03, -1.54, -1.22, 4.27, -2.39, 0, -3.61, 0.93, -8.6, -4.41, 5.23, -3.77, 0.99, -6.99, -1.57, 3, -0.47, -3.44, 10.14, 2.8, 1.28, -0.16, 8, 4.47, 1.46, 0.86, -3.14, 1.47, 2.22, -0.05, -1.66, 3.6, -2.25, -5.84, 5.91, 2.39, 4.85, 5.07, -2.37, 0.86, -4.37, -3.32, 2.24, -3.78, 1.35, 0.01, -1.53, -3.88, 2.32, -4.27, -1.08, 4.45, 3.55, 1.82, 11.33, 1.49, -1.67, 0.49, -3.35, 0.26, -2.57, -2.51, -13.35, 6.11, 8.47, 3.94, -4.56, -3.28, 3.92, 5.81, -3.57, -1.75, -4.77, 4, -3.46, -2.25, -0.94, -4.16, -11.13, 5.81, -3.29, 5.69, 10.75, 4.29, -0.21, -0.38, 6.03, -1.97, -4.57, 7.61, -5.07, 3.82, 1.73, 8.15, 5.79, 0.19, 1.28, 3.23, -5.88, -10.91, 9.61, 
-0.47, 6.15, -6.18, -0.29, 1.76, 0.34)
respData1 <- c(27.8, 39.19, 39.31, 42.6, 46.38, 40.21, 44.47, 46.24, 34.38, 69.78, 47.47, 53.41, 52.07, 47.09, 49.82, 13.05, 37.2, 54.27, 42.01, 31.94, 46.56, 10.89, 34.58, 44.43, 51.34, 44.57, 23.28, 46.32, 38.84, 50.78, 34.92, 35.1, 59.31, 40.65, 26.79, 37.85, 45.41, 52.6, 19.58, 36.63, 17.9, 63.94, 51.59, 19.05, 14.15, 37.03, 66.97, 15.58, 29.71, 43.78, 47.02, 37.27, 30.03, 43.11, 36.41, 32.44, 42.13, 46.72, 39.5, 32.4, 45.52, 26.89, 72.18, 51.75, 46.9, 41.5, 22.07, 46.82, 17.87, 50.12, 18.44, 28.21, 68.83, 24.07, 49.43, 43.31, 53.94, 26.36, 7.55, 17.13, 64.75, 36.93, 8.65, 21.06, 44.15, 40.35, 27.84, 42.75, 38.86, 21.84, 52.34, 63.13, 43.23, 38.48, 25.56, 37.81, 19.7, 32.33, 51.69, 40.01, 35.01, 60.59, 47.98, 32.92, 62.64, 15.48, 28.79, 46.04, 60.79, 32.6, 55.21, 41.05, 33.99, 58.24, 50.12, 43.4, 38.2, 58.34, 40.5, 25.68, 60.69, 44.46, 25.28, 40.56, 42.48, 32.15, 52.42, 56.78, 31.09, 18.29, 53.15, 30.62, 43.09, 35.78, 56.31, 20.42, 23.26, 48.99, 26.23, 61, 41.67, 41.04, 11.61, 41.64, 50.24, 18.98, 48.7, 17.97, 38, 50.85, 63.39, 57.49, 19.51, 54.11, 18.01, 33.74, 19.89, 44.66, 23.09, 42.45, 47.84, 39.38, 26.44, 25.24, 46.74, 33.03, 35.28, 35.73, -2.21, 20.49, 54.54, 50.42, 34.82, 47.67, 13.75, 44.62, 33.73, 31.53, 42.63, 36.64, 55.48, 49.84, 41.98, 69.24, 48.39, 39.12, 40.55, 41.95, 29.31, 34.22, 32.13, 33.6, 14.66, 23.75, 31.9, 35.76, 29, 50.02, 51.85, 13, 43.69, 45.67, 39.06, 43.92, 47.04, 11.32, 66.35, 56.47, 46.27, -0.56, 58.99, 57.85, 50.48, 24.43, 62.28, 49.07, 29.16, 63.71, 35.43, 25.9, 27.7, 40.02, 36.33, 43.11, 28.98, 51.88, 45.73, 48.29, 44.55, 28.76, 60.29, 14.26, 38.09, 43.13, 47.68, 60.55, 47.78, 29.85, 26.43, 62.71, 31.78, 38.87, 28.99, 56.19, 17.08, 44.7, 51.59, 9.56, 39.35, 48.91, 35.22, 26.53, 36.73, 43.78, 50.2, 32.55, 53.92, 33.67, 32.58, 34.44, 41.82, 51.16, 21.73, 53.09, 35.07, 6.84, 30.26, 33.74, 54.12, 32.41, 57.36, 52.16, 22, 64.32, 42.23, 51.91, 44.38, 47.45, 57.47, 48.04, 28.02, 21.08, 52.87, 30.48, 52.76, 51.07, 21.07, 37.38, 27.2, 
35.8, 35.36, 50.08, 61.72, 27.94, 54.76, 57.88, 50.57, 38.38, 58.27, 13.5, 26.88, 33.78, 67.2, 31.6, 43.14, 43.19, 43.39, 43.09, 30.85, 34.1, 72.42, 15.42, 66.27, 43.71, 28.42, 13.3, 46.63, 35.42, 52.04, 55.89, 49.6, 44.02, 24.67, 46.79, 37.98, 39.42, 23.08, 26.36, 30.27, 57.94, 22.79, 60.07, 36.51, 49.22, 22.53, 29.96, 62.01, 42.3, 30.05, 55.57, 51.68, 23.05, 28.9, 47.02, 63.76, 42.48, 56.21, 44.29, 28.75, 43.53, 29.18, 14.44, 67.68, 16.92, -1, 51.54, 24.68, 77.5, 72.04, 51.44, 59.89, 5.48, 65.2, 35.03, 29.17, 25.99, 65.76, 54.67, 51.44, 51.34, 25.71, 25.48, 45.49, 37.31, 25.55, 59.4, 23.38, 46.47, 5.27, 48.51, 59.98, 34.85, 48.4, 62.56, 27.1, 41.17, 60.38, 57.21, 17.7, 39.84, 9.25, 39.82, 60.86, 53.29, 33.74, 66.61, 66.06, 50.5, 67.98, 21.79, 25.02, 48.24, 69.45, 35.39, 67.24, 53.73, 25.21, 43.4, 50.39, 30.88, 44.33, 6.28)
respData2 <- c(18.26, 36.11, 25.88, 46.64, 38.22, -2.89, 24.17, 32.85, 37.09, 52.34, 12.35, 50.37, 31.17, 22.07, 42.49, 51.39, 34.57, 25.83, 28.45, 37.21, 41.36, 71.02, 46.11, 39.2, 36.16, 41.46, 40.83, 59.73, 30.75, 1.55, 26.67, 49.85, 35.61, 50.58, 39.23, 40.37, 45.63, 35.29, 46.06, 44.51, 36.47, 51.52, 46.69, 36.55, 53.82, 66.62, 47.33, 54.24, 27.13, 45.48, 53.06, 21.14, 52.55, 51.62, 47.59, 40.92, 27.66, 34.75, 30.73, 61.73, 36.3, 48.03, 34.31, 61.52, 27.52, 32.32, 45.33, 31.56, 32.85, 33.09, 44.52, 15.36, 48.63, 37.8, 45.67, 42.7, 40.2, 13.56, 29.57, 48.77, 29.23, 34.8, 26.91, 50.36, 29.04, 27.91, 0.98, 44.37, 29.11, 36.25, 37.73, 36.64, 18.95, 41.73, 54.34, 34.53, 56.36, 44.06, 39.32, 21.44, 53.32, 27.81, 49.95, 49.67, 68.74, 31.31, 57.06, 33.94, 39.64, 7.46, 52.8, 34.4, 70.42, 32.35, 49.45, 47.13, 34.96, 42.26, 38.28, 28.98, 37.15, 49.47, 29.23, 31.53, 28.17, 35.08, 32.34, 33.13, 33.98, 31.66, 29.63, 38.07, 49.42, 48.03, 45.81, 42.76, 63.01, 31.79, 36.26, 35.28, 34.19, 24.96, 9.13, 30.69, 36.83, 22.96, 52.03, 52.85, 49.96, 54.53, 31.88, 14.11, 51.02, 28.36, 40.92, 53.8, 63.55, 49.42, 16.49, 26.25, 34.56, 34.24, 29.5, 56.65, 33.47, 57.91, 54.78, 40.52, 41.14, 43.87, 28.43, 25.15, 38.2, 52.35, 40.83, 58.92, 37.48, 50.09, 33.76, 46.91, 30.51, 52.1, 45.28, 25.65, 28.95, 43.69, 49.32, 32.96, 34.64, 45.21, 30.77, 43.83, 45.89, 27.21, 38.51, 23.67, 37.26, 49.04, 42.06, 7.7, 36.93, 20.52, 29.9, 30.13, 55.83, 18.76, 35.06, 36.68, 34.92, 59.64, 41.81, 22.45, 28.44, 77.59, 59.44, 19.26, 34.14, 63.37, 24.93, 24.94, 16.79, 38.96, 34.77, 55, 43.88, 43.47, 29, 38.99, 20.2, 59.86, 42.71, 52.67, 47.27, 42.85, 15.67, 54, 68.67)
respData3 <- c(52.15, 49.27, 49.14, 40.6, 50.46, 58.2, 45.66, 20.46, 49.97, 42.83, 25.73, 28.75, 27.69, 22.29, 53.53, 52.18, 41.86, 69.86, 37.78, 31.75, 46.9, 32.98, 52.79, 35.54, 34.51, 26.63, 39.69, 27, 47.79, 19.5, 45.63, 26.92, 44.14, 16.93, 42.78, 56.02, 47.01, 46.89, 19.48, 25.95, 61.74, 47.83, 45.1, 41.11, 64.28, 27.7, 8.05, 49.37, 43.05, 49.23, 33.99, 25.97, 60.66, 44.42, 37.06, 40.95, 21.97, 18.88, 34.68, 41.47, 35.65, 36.49, 31.45, 36.02, 38.67, 28.87, 47.33, 48.99, 39.26, 59.34, 57.07, 39.02, 35.12, 50.94, 48.7, 59.07, 36.1, 41.1, 48.53, 36.84, 20.57, 67.63, 52.86, 28.83, 47.47, 43.59, 63.2, 59.74, 23.63, 44.15, 49.71, 41.84, 28.01, 24.4, 36.04, 46.16, 39.15, 50.76, 27.61, 11.67, 27.04, 39.95, 19.58, 31.08, 70.48, 41.18, 21.73, 51.49, 45.3, 61.05, 18.11, 26.11, 41.49, 51.15, 45.32, 35.03, 45.41, 48.5, 41.7, 56.59, 44.21, 51.48, 22.45, 38.89, 42.35, 47.44, 44.4, 47.43, 38.98, 15.87, 73.93, 57.64, 55.71, 56.87, 40.79, 35.31, 56.81, 31.64, 51.35, 40.61, 44.84, 63.09, 56.03, 36.39, 54.85, 35.81, 5.16, 25.41, 26.6, 39.91, 18.4, 30.47, 28.81, 35.49, 21.98, 54.58, 37.43, 45.14, 26.94, 42.56, 61.86, 63.86, 22.83, 41.93, 43.3, 35.72, 30.3, 23.57, 62.44, 28.81, 45.11, 38.81, 37.09, 35.89, 28.37, 25.34, 20.95, 37.1, 50.87, 44.03, 25.36, 46.32, 40.63, 13.77, 33.89, 35.41, 42.71, 57.71, 46.37, 48.02, 42.58, 20.81, 64.81, 42.34, 44.93, 24.07, 71.44, 49.04, 65.73, 30.5, 56.97, 77.54, 37.53, 49.04, 52.37, 44.21, 49.64, 41.62, 33.15, 26.12, 59.01, 50.39, 12.17, 26.18, 51.92, 35.1, 45.64, 62.54, 47.18, 23.61, 13.8, 48.7, 22.64, 23.25, 26.07, 65.36, -3.6, 44.56, 20.4, 28.77, 44.92, 52.98, 38.6, 46.51, 37.8, 41.54, 14.45, 40.69, 61.11, 55.09, 30.34, 39.57, 32.23, 48.52, 44.47, 27.24, 40.09, 48.87, 31.78, 44.45, 50.95, 36.98, 24.06, 22, 39.89, 33.79, 52.91, 18.36, 32.73, 65.36, 39.55, 36.56, 61.26, 70.61, 48.07, 41.77, 83.77, 62.77, 36.37, 38.26, 31.61, 46.1, 58.54, 24.53, 39.71, 58.49, 36, 28.21, 41.23, 31.57, 31.77, 42.54, 22.47, 48.5, 46.3, 30.97, 53.55, 23.35, 60.6, 
41.07, 46.19, 22.14, 52.2, 29.76, 42.34, 43.52, 38.48, 49.56, 59.15, 29.13, 20.9, 40.13, 25.61, 56.45, 35.77, 34.6, 27.61, 37.08, 42.26, 36.76, 23.48, 27.94, 43.68, 49.03, 54.34, 57.83, 45.74, 54, 41.25, 36.96, 56.99, 25.3, 37.1, 54.32, 39.65, 59.93, 53.08, 32.52, 25.58, 34.33, 50.6, 49.97, 23.38, 66.08, 39.08, 47.11, 54.24, 55.54, 45.4, 44.52, 36.92, 45.72, 29.69, 13.78, 41.59, 27.96, 40.98, 19.86, 37.85, 23.43, 41.28)
# Assemble the simulated population (1000 rows) from the pasted vectors above.
popdata <- data.frame(explanatory=c(expData1, expData2, expData3),
response=c(respData1, respData2, respData3)
)
str(popdata)
## 'data.frame': 1000 obs. of 2 variables:
## $ explanatory: num -4.3 0.19 -2.59 -0.43 0.59 -2.74 3.09 3.51 0.56 5.89 ...
## $ response : num 27.8 39.2 39.3 42.6 46.4 ...
# Plot the whole dataset
ggplot(popdata, aes(x = explanatory, y = response)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Take 2 samples of size 50
# (seed fixed so the DataCamp transcript is reproducible)
set.seed(4747)
sample1 <- popdata %>% sample_n(50)
sample2 <- popdata %>% sample_n(50)
# Plot sample1
plot1 <- ggplot(sample1, aes(x = explanatory, y = response)) +
geom_point(color = "blue") +
geom_smooth(method = "lm", se = FALSE, color = "blue")
plot1
# Plot sample2 over sample1
# (overlaying the two fitted lines shows sample-to-sample slope variability)
plot1 + geom_point(data = sample2,
aes(x = explanatory, y = response),
color = "red") +
geom_smooth(data = sample2,
aes(x = explanatory, y = response),
method = "lm",
se = FALSE,
color = "red")
# Repeatedly sample the population
# (each replicate is a simple random sample of 50 rows, tagged by `replicate`)
manysamples <- infer::rep_sample_n(popdata, size=50, reps=100)
# Plot the regression lines
ggplot(manysamples, aes(x=explanatory, y=response, group=replicate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Fit and tidy many linear models
# NOTE(review): dplyr::do() is superseded; group_modify()/nest() would be the
# modern equivalent — kept as-is to match the course transcript.
manylms <- manysamples %>%
group_by(replicate) %>%
do(lm(response ~ explanatory, data=.) %>%
broom::tidy()) %>%
filter(term=="explanatory")
# Plot a histogram of the slope coefficients
# (this is the empirical sampling distribution of the slope)
ggplot(manylms, aes(x=estimate)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Experiment 1: effect of sample size on slope variability.
# Take 100 samples of size 50
manysamples1 <- infer::rep_sample_n(popdata, size=50, reps=100)
# Plot the regression line for each sample
ggplot(manysamples1, aes(x=explanatory, y=response, group=replicate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Take 100 samples of size 10
# (smaller samples -> visibly more variable fitted slopes)
manysamples2 <- infer::rep_sample_n(popdata, size=10, reps=100)
# Plot the regression line for each sample
ggplot(manysamples2, aes(x=explanatory, y=response, group=replicate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# In order to understand the sampling distribution associated with the slope coefficient, it is valuable to visualize the impact changes in the sample and population have on the slope coefficient. Here, reducing the variance associated with the response variable around the line changes the variability associated with the slope statistics.
# The new popdata is already loaded in your workspace.
# Take 100 samples of size 50
# Experiment 2: shrink the response's spread (z-score, then rescale to
# mean 40, sd 11.152) and redraw the samples to see tighter slopes.
oldPopData <- popdata
popdata$response <- (oldPopData$response - mean(oldPopData$response)) / sd(oldPopData$response)
popdata$response <- 40 + popdata$response * 11.152
manysamples <- infer::rep_sample_n(popdata, size=50, reps=100)
# Plot a regression line for each sample
ggplot(manysamples, aes(x=explanatory, y=response, group=replicate)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
Chapter 2 - Simulation Based Inference for Slope Parameters
Simulation-based inference - using the twins study from the 1920s (one twin was raised by their parents and the other in a foster home):
Simulation-based inference for slope - can also be calculated using bootstrap for CI (as opposed to testing a null-hypothesis):
Example code includes:
# Load the infer package
library(infer)
twins <- readr::read_csv("./RInputFiles/twins.csv")
## Parsed with column specification:
## cols(
## Foster = col_integer(),
## Biological = col_integer(),
## Social = col_character()
## )
str(twins)
## Classes 'tbl_df', 'tbl' and 'data.frame': 27 obs. of 3 variables:
## $ Foster : int 82 80 88 108 116 117 132 71 75 93 ...
## $ Biological: int 82 90 91 115 115 129 131 78 79 82 ...
## $ Social : chr "high" "high" "high" "high" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 3
## .. ..$ Foster : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Biological: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Social : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Calculate the observed slope
# (tidy the fit, keep the Biological row, pull out its point estimate)
obs_slope <- lm(Foster ~ Biological, data=twins) %>%
broom::tidy() %>%
filter(term == "Biological") %>%
pull(estimate)
# Simulate 10 slopes with a permuted dataset
# (permutation breaks any Foster~Biological association: the null distribution)
set.seed(4747)
perm_slope <- twins %>%
specify(Foster ~ Biological) %>%
hypothesize(null = "independence") %>%
generate(reps = 10, type = "permute") %>%
calculate(stat = "slope")
# Print the observed slope and the 10 permuted slopes
obs_slope
## [1] 0.901436
perm_slope
## # A tibble: 10 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.143
## 2 2 0.0710
## 3 3 -0.456
## 4 4 0.0749
## 5 5 0.297
## 6 6 0.0673
## 7 7 0.140
## 8 8 0.164
## 9 9 0.0971
## 10 10 0.184
# Make a dataframe with replicates and plot them!
set.seed(4747)
perm_slope <- twins %>%
specify(Foster ~ Biological) %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "slope")
ggplot(perm_slope, aes(x=stat)) +
geom_density()
# Calculate the mean and the standard deviation of the slopes
mean(perm_slope$stat)
## [1] 0.006285095
sd(perm_slope$stat)
## [1] 0.1963073
# Calculate the absolute value of the slope
abs_obs_slope <- lm(Foster ~ Biological, data=twins) %>%
broom::tidy() %>%
filter(term == "Biological") %>%
pull(estimate) %>%
abs()
# Compute the p-value
# (two-sided: fraction of permuted |slopes| exceeding the observed |slope|;
# NOTE(review): some texts use >= and add 1 to numerator/denominator so the
# p-value is never exactly 0 — kept as the course wrote it)
perm_slope %>%
mutate(abs_perm_slope=abs(stat)) %>%
summarize(p_value = mean(abs_perm_slope > abs_obs_slope))
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
# Calculate 1000 bootstrapped slopes
# (resampling rows with replacement; no hypothesize() step for a bootstrap)
set.seed(4747)
BS_slope <- twins %>%
specify(Foster ~ Biological) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope")
# Look at the head of BS_slope
head(BS_slope)
## # A tibble: 6 x 2
## replicate stat
## <int> <dbl>
## 1 1 0.946
## 2 2 0.966
## 3 3 0.870
## 4 4 0.930
## 5 5 0.807
## 6 6 0.900
# Create a confidence interval
# (standard-error method: point estimate +/- 2 bootstrap SDs, ~95% coverage)
BS_slope %>%
summarize(lower = mean(stat) - 2 *sd(stat),
upper = mean(stat) + 2 *sd(stat))
## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.719 1.08
# Set alpha
alpha <- 0.05
# Create a confidence interval
# (percentile method: middle 1 - alpha of the bootstrap distribution)
BS_slope %>%
summarize(low = quantile(stat, alpha/2),
high = quantile(stat, 1 - alpha/2))
## # A tibble: 1 x 2
## low high
## <dbl> <dbl>
## 1 0.724 1.08
Chapter 3 - t-Based Inference for the Slope Parameter
Mathematical approximation for testing and estimating slope parameters (based on the t-distribution):
Intervals in regression - estimating the coefficients by way of confidence intervals (CI):
Different types of intervals - often of interest to know the variability in the predicted value, not just the parameter estimates:
Example code includes:
# NOTE(review): the block below is commented-out course code comparing the
# permutation distribution of the slope to a t distribution; it was disabled
# (presumably it did not run as transcribed — `term`/`statistic` columns are
# not produced by calculate()) and is kept for reference only.
# twins_perm <- twins %>%
# specify(Foster ~ Biological) %>%
# hypothesize(null="independence") %>%
# generate(reps = 10, type = "permute") %>%
# calculate(stat = "slope")
# The randomized slopes are given in the twins_perm dataframe
# Look at the head of the data
# head(twins_perm)
# Plot the histogram with the t distribution
# twins_perm %>%
# filter(term == "Biological_perm") %>%
# ggplot(aes(x=statistic)) +
# geom_histogram(aes(y = ..density..), bins = 50) +
# stat_function(fun = dt, color = "red", args = list(df=nrow(twins)-2))
# Tidy the model
lm(Foster ~ Biological, data=twins) %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 9.207599 9.29989643 0.9900754 3.316237e-01
## 2 Biological 0.901436 0.09633286 9.3575128 1.203600e-09
# Create a one-sided p-value
# (tidy() reports two-sided p-values; halving gives the one-sided test)
lm(Foster ~ Biological, data=twins) %>%
broom::tidy() %>%
filter(term == "Biological") %>%
select(p.value) %>%
mutate(p_value_1side = p.value/2)
## p.value p_value_1side
## 1 1.2036e-09 6.018e-10
# Test the new hypothesis
# (H0: slope = 1 instead of the default H0: slope = 0, hence (estimate - 1))
lm(Foster ~ Biological, data = twins) %>%
broom::tidy() %>%
filter(term == "Biological") %>%
mutate(statistic_test1 = (estimate - 1) / std.error,
p_value_test1 = 2 * pt(abs(statistic_test1), df=nrow(twins)-2, lower.tail=FALSE))
## term estimate std.error statistic p.value statistic_test1
## 1 Biological 0.901436 0.09633286 9.357513 1.2036e-09 -1.023161
## p_value_test1
## 1 0.3160311
# Find the p-value
# perm_slope %>%
# mutate(abs_perm_slope = abs(stat)) %>%
# summarize(p_value = mean(abs_perm_slope > abs(obs_slope)))
# Set alpha
alpha <- 0.05
# Find the critical value
# (t critical value with n - 2 df for a simple linear regression)
crit_val <- qt(0.975, df = nrow(twins)-2)
# Tidy the model with the confidence level alpha
lm(Foster ~ Biological, data=twins) %>%
broom::tidy(conf.int=TRUE, conf.level=1-alpha)
## term estimate std.error statistic p.value conf.low
## 1 (Intercept) 9.207599 9.29989643 0.9900754 3.316237e-01 -9.9458964
## 2 Biological 0.901436 0.09633286 9.3575128 1.203600e-09 0.7030348
## conf.high
## 1 28.361094
## 2 1.099837
# Find the lower and upper bounds of the confidence interval
# (manual estimate +/- t* SE reproduces tidy()'s conf.low / conf.high)
lm(Foster ~ Biological, data=twins) %>%
broom::tidy() %>%
mutate(lower = estimate - crit_val * std.error,
upper = estimate + crit_val * std.error)
## term estimate std.error statistic p.value lower
## 1 (Intercept) 9.207599 9.29989643 0.9900754 3.316237e-01 -9.9458964
## 2 Biological 0.901436 0.09633286 9.3575128 1.203600e-09 0.7030348
## upper
## 1 28.361094
## 2 1.099837
# Create the bootstrap confidence interval
BS_slope %>%
summarize(low = quantile(stat, alpha/2),
high = quantile(stat, 1 - alpha/2))
## # A tibble: 1 x 2
## low high
## <dbl> <dbl>
## 1 0.724 1.08
# Set alpha
alpha <- 0.05
# Find the critical value
crit_val <- qt(1-alpha/2, nrow(twins)-2)
# Create a dataframe of new observations
newtwins <- data.frame(Biological = c(80, 90, 100, 110))
# Find confidence intervals for the mean response at the new observations
# (these use .se.fit only, so they bound the mean, not individual predictions;
# true prediction intervals are computed further below with .se.pred)
lm(Foster ~ Biological, data=twins) %>%
broom::augment(newdata = newtwins) %>%
mutate(lowMean = .fitted - crit_val * .se.fit,
upMean = .fitted + crit_val * .se.fit)
## Biological .fitted .se.fit lowMean upMean
## 1 80 81.32248 2.093789 77.01024 85.63472
## 2 90 90.33684 1.572563 87.09809 93.57559
## 3 100 99.35120 1.554979 96.14866 102.55374
## 4 110 108.36556 2.054014 104.13524 112.59588
# Set alpha and find the critical value
alpha <- 0.05
crit_val <- qt(1-alpha/2, df=nrow(twins)-2)
# Find confidence intervals for the response
predMeans <- lm(Foster ~ Biological, data=twins) %>%
broom::augment() %>%
mutate(lowMean = .fitted - crit_val*.se.fit,
upMean = .fitted + crit_val*.se.fit)
# Examine the intervals
head(predMeans)
## Foster Biological .fitted .se.fit .resid .hat .sigma
## 1 82 82 83.12535 1.962975 -1.125351 0.06449924 7.885059
## 2 80 90 90.33684 1.572563 -10.336839 0.04139435 7.588558
## 3 88 91 91.23827 1.543999 -3.238275 0.03990427 7.859737
## 4 108 115 112.87274 2.411531 -4.872739 0.09734432 7.818859
## 5 116 115 112.87274 2.411531 3.127261 0.09734432 7.859970
## 6 117 129 125.49284 3.571300 -8.492843 0.21349013 7.642607
## .cooksd .std.resid lowMean upMean
## 1 0.0007811552 -0.1505319 79.08253 87.16817
## 2 0.0402839133 -1.3659358 87.09809 93.57559
## 3 0.0037993804 -0.4275816 88.05835 94.41820
## 4 0.0237414774 -0.6635515 107.90610 117.83938
## 5 0.0097788932 0.4258588 107.90610 117.83938
## 6 0.2083390637 -1.2389779 118.13761 132.84807
# Plot the data with geom_ribbon()
# (manual ribbon should match stat_smooth's se band in the next plot)
ggplot(predMeans, aes(x=Biological, y=Foster)) +
geom_point() +
stat_smooth(method="lm", se=FALSE) +
geom_ribbon(aes(ymin = lowMean, ymax = upMean), alpha=.2)
# Plot the data with stat_smooth()
ggplot(twins, aes(x = Biological, y = Foster)) +
geom_point() +
stat_smooth(method="lm", se=TRUE)
# True prediction intervals for individual responses: combine the residual
# standard error (sigma) with the SE of the fitted mean.
# Set alpha and find the critical value
alpha <- 0.05
crit_val <- qt(1-alpha/2, nrow(twins)-2)
# Fit a model and use glance to find sigma
twin_lm <- lm(Foster ~ Biological, data=twins)
twin_gl <- broom::glance(twin_lm)
# Pull sigma
twin_sig <- pull(twin_gl, sigma)
# Augment the model to find the prediction standard errors
# (se.pred^2 = sigma^2 + se.fit^2; use the documented `^` operator rather
# than the undocumented `**` synonym)
twin_pred <- broom::augment(twin_lm) %>%
mutate(.se.pred = sqrt(twin_sig^2 + .se.fit^2))
# Create prediction intervals
predResp <- twin_pred %>%
mutate(lowResp = .fitted - crit_val * .se.pred,
upResp = .fitted + crit_val * .se.pred)
# Plot the intervals using geom_ribbon()
# (wider grey band = prediction interval; red band = mean-response interval)
ggplot(predResp, aes(x=Biological, y=Foster)) +
geom_point() +
stat_smooth(method="lm", se=FALSE) +
geom_ribbon(aes(ymin = lowResp, ymax = upResp), alpha = .2) +
geom_ribbon(data = predMeans, aes(ymin = lowMean, ymax = upMean), alpha = .2, fill = "red")
Chapter 4 - Technical Conditions in Linear Regression
Technical conditions for linear regression:
Effect of an outlier - can have unintended impact on the inferential conclusions:
Moving forward when model assumptions are violated:
Example code includes:
# A dataset containing well behaved observations has been preloaded and is called hypdata_nice. There are two variables in the dataset which are aptly named explanatory and response.
expl <- c(2.14, 3.04, 2.48, 2.91, 3.12, 2.45, 3.62, 3.7, 3.11, 4.18, 3.07, 3, 3.52, 3.3, 3.58, 1.35, 2.91, 3.66, 3.97, 3.23, 3.61, 3.45, 3.36, 2.95, 3.86, 4.38, 1.73, 3.1, 3.68, 3.62, 3.42, 2.61, 3.74, 3.1, 2.07, 3.85, 2.76, 4.03, 0.91, 4.29, 2.44, 3.14, 3.59, 2.03, 1.58, 2.2, 4.65, 2.1, 1.96, 2.57, 3.46, 3.07, 2.49, 3.64, 1.4, 2.64, 1.77, 3.29, 2.96, 2.82, 2.6, 3.23, 5.24, 3.89, 3.18, 2.94, 1.14, 3.04, 1.62, 3.09, 1.18, 2.76, 5.26, 2.34, 3.71, 3.26, 4.15, 2.45, 1.06, 2.11, 4.14, 3.22, 1.34, 1.58, 2.93, 2.14, 3.83, 3.97, 2.94, 2.24, 4.01, 5.46, 3.96, 3.14, 0.79, 3.75, 1.67, 2.95, 4.02, 2.9, 3.14, 3.89, 3.42, 3.36, 3.6, 2.03, 2.22, 3.82, 3.01, 1.51, 2.52, 3.23, 2.63, 5.22, 4.25, 3.26, 2.09, 4.08, 3.62, 2.08, 4.71, 2.16, 2.82, 3.16, 2.3, 1.77, 3.72, 3.91, 2.03, 0.3, 3.32, 2.79, 3.27, 2.71, 4.11, 2.15, 1.61, 4.23, 2.82, 4.99, 2.18, 3.02, 3.5, 1.32, 2.52, 1.61, 4.58, 1.99, 2.95, 2.87, 4.68, 3.84, 2.03, 3.2, 1.89, 2.15, 2.55, 3.44, 2.99, 2.47, 2.77, 3.66, 1.9, 3.77, 3.42, 3.96, 2.78, 0.81, -0.34, 1.28, 3.76, 4.19, 3.07, 3.03, 1.28, 2.31, 1.97, 1.67, 2.9, 2.6, 3.71, 3.94, 2.88, 4.74, 3.18, 3.14, 3.72, 2.38, 2.4, 4.16, 3.77, 3.16, 2.45, 1.75, 2.39, 3.26, 3.32, 3.35, 3.93, 1.55)
resp <- c(19.04, 21.44, 19.21, 20.63, 21.66, 15.99, 26.76, 28.85, 20.9, 19.16, 20.53, 22.79, 22.55, 21.72, 26.31, 18.15, 18.3, 28.03, 24.12, 20.47, 23.03, 22.06, 21.91, 20.4, 22.89, 28.62, 16.63, 22.41, 23.45, 20.06, 25.29, 15.54, 22.88, 20.3, 18.87, 26.13, 19.63, 23.3, 12.38, 27.77, 18.67, 23.12, 19.87, 20.51, 17.92, 20.25, 29.82, 13.36, 18.36, 21.2, 23.69, 17.57, 18.19, 23.12, 15.58, 18.53, 20, 21.8, 19.32, 18.8, 20.24, 24.26, 27.59, 26.02, 20.1, 17.24, 12.87, 20.05, 16.27, 19.7, 20.37, 22.49, 24.61, 23.41, 20.51, 20.4, 24, 23.17, 19.66, 17.06, 23.15, 18.48, 16.99, 14.89, 23.4, 19.94, 20.93, 24.21, 19.86, 17.71, 22.74, 25.86, 25.75, 21.4, 16.65, 25.84, 18.49, 18.95, 26, 19.74, 19.39, 25.16, 22.94, 22.01, 26.12, 19.21, 18.62, 25.96, 19.53, 15.48, 22.79, 19.91, 21.65, 29.06, 22.07, 19.59, 21.01, 26.21, 23.75, 18.61, 26.75, 17.06, 18.28, 21.06, 17.18, 17.07, 22.46, 21.55, 14.53, 10.98, 19.34, 21.72, 23.16, 21.76, 23.11, 16.42, 19.64, 24.33, 19.44, 26.39, 22.03, 19.44, 20.14, 17.29, 21.89, 17.28, 25.25, 17.96, 20.94, 21, 27.3, 20.64, 23.03, 17.6, 13.78, 19.08, 16.24, 27.76, 21.19, 21.96, 21.25, 20.94, 20.3, 22.67, 21.8, 21.34, 23.33, 15.82, 12.92, 16.54, 23.08, 24.23, 22.24, 19.79, 15.31, 20.81, 17.52, 20.92, 14.01, 18.63, 27.18, 21.96, 22.9, 26.57, 22.39, 22.09, 24.67, 20.51, 18.77, 23.92, 19.28, 21.14, 21.12, 15.82, 18.52, 25.09, 25.27, 22.47, 26.35, 16.52)
# Well-behaved synthetic dataset built from the pasted vectors above.
hypdata_nice <- data.frame(response=resp, explanatory=expl)
str(hypdata_nice)
## 'data.frame': 200 obs. of 2 variables:
## $ response : num 19 21.4 19.2 20.6 21.7 ...
## $ explanatory: num 2.14 3.04 2.48 2.91 3.12 2.45 3.62 3.7 3.11 4.18 ...
# Plot the data
ggplot(hypdata_nice, aes(x=explanatory, y=response)) +
geom_point()
# Create an augmented model
nice_lm <- lm(response ~ explanatory, data=hypdata_nice) %>%
broom::augment()
# Print the head of nice_lm
head(nice_lm)
## response explanatory .fitted .se.fit .resid .hat .sigma
## 1 19.04 2.14 18.69469 0.2035616 0.3453107 0.008481517 2.215806
## 2 21.44 3.04 21.23226 0.1568479 0.2077393 0.005035457 2.215894
## 3 19.21 2.48 19.65333 0.1738709 -0.4433274 0.006187790 2.215717
## 4 20.63 2.91 20.86572 0.1564783 -0.2357226 0.005011754 2.215879
## 5 21.66 3.12 21.45782 0.1584301 0.2021774 0.005137562 2.215896
## 6 15.99 2.45 19.56874 0.1760204 -3.5787417 0.006341730 2.201131
## .cooksd .std.resid
## 1 1.052794e-04 0.15689185
## 2 2.246532e-05 0.09422272
## 3 1.260166e-04 -0.20119317
## 4 2.878777e-05 -0.10691363
## 5 2.171440e-05 0.09170476
## 6 8.418718e-03 -1.62424910
# Plot the residuals
# (residuals vs fitted: a patternless cloud supports the linearity and
# equal-variance conditions)
ggplot(nice_lm, aes(x=.fitted, y=.resid)) +
geom_point()
# A dataset containing poorly behaved observations has been preloaded and is called hypdata_poor. There are two variables in the dataset which are aptly named explanatory and response
# (Reconstructed here from literal vectors since the DataCamp workspace is
# not available locally.)
resp <- c(19.08, 21.62, 19.15, 20.57, 21.84, 15.23, 29.92, 33.73, 20.66, 13.32, 20.16, 23.68, 22.54, 21.6, 29.13, 17.46, 17.2, 32.23, 24.34, 19.71, 23.2, 21.85, 21.8, 20.19, 22.25, 32.75, 16.7, 23.01, 23.8, 17.81, 27.43, 14.22, 22.57, 19.74, 18.89, 28.52, 19.39, 22.5, 13.66, 31.09, 18.52, 24.1, 17.58, 20.55, 17.67, 20.41, 34.94, 13.12, 18.35, 21.63, 24.64, 15.61, 17.88, 23.29, 15.76, 18.09, 19.68, 21.75, 18.56, 18.12, 20.38, 25.84, 27.38, 28.25, 19.27, 15.54, 13.97, 19.5, 16.38, 18.85, 18.39, 23.33, 19.43, 24.15, 18.26, 19.53, 23.52, 24.03, 17.56, 16.99, 21.78, 16.54, 16.67, 15.28, 24.62, 20.05, 18.58, 24.52, 19.41, 17.59, 21.46, 21.5, 27.59, 21.39, 15.27, 28.11, 18.25, 18.05, 27.96, 19.3, 18.25, 26.58, 23.42, 21.95, 28.78, 19.23, 18.62, 28.23, 18.78, 15.74, 23.63, 18.82, 22.2, 31.28, 19.05, 18.21, 21.14, 28.26, 24.47, 18.63, 27.6, 16.95, 17.39, 20.82, 16.92, 17.1, 21.87, 19.48, 14.47, 12.61, 17.62, 22.25, 24.02, 22.34, 21.83, 16.26, 19.09, 23.91, 19.02, 25.52, 22.33, 18.63, 18.39, 16.83, 22.5, 17.19, 24.61, 17.96, 20.98, 21.17, 29, 18.01, 23.1, 15.21, 13.99, 19.13, 15.29, 31.68, 21.3, 22.55, 21.61, 19.24, 20.18, 22.09, 21.47, 18.84, 24.5, 14.98, 10.67, 16.3, 22.91, 23.86, 22.79, 19.12, 15.5, 21.1, 17.53, 20.27, 10.99, 18.28, 30.65, 20.17, 23.9, 27.05, 22.89, 22.47, 25.97, 20.77, 18.68, 23.3, 15.71, 20.94, 21.52, 16, 18.39, 27.17, 27.45, 22.74, 28.83, 16.55)
expl <- c(2.14, 3.04, 2.48, 2.91, 3.12, 2.45, 3.62, 3.7, 3.11, 4.18, 3.07, 3, 3.52, 3.3, 3.58, 1.35, 2.91, 3.66, 3.97, 3.23, 3.61, 3.45, 3.36, 2.95, 3.86, 4.38, 1.73, 3.1, 3.68, 3.62, 3.42, 2.61, 3.74, 3.1, 2.07, 3.85, 2.76, 4.03, 0.91, 4.29, 2.44, 3.14, 3.59, 2.03, 1.58, 2.2, 4.65, 2.1, 1.96, 2.57, 3.46, 3.07, 2.49, 3.64, 1.4, 2.64, 1.77, 3.29, 2.96, 2.82, 2.6, 3.23, 5.24, 3.89, 3.18, 2.94, 1.14, 3.04, 1.62, 3.09, 1.18, 2.76, 5.26, 2.34, 3.71, 3.26, 4.15, 2.45, 1.06, 2.11, 4.14, 3.22, 1.34, 1.58, 2.93, 2.14, 3.83, 3.97, 2.94, 2.24, 4.01, 5.46, 3.96, 3.14, 0.79, 3.75, 1.67, 2.95, 4.02, 2.9, 3.14, 3.89, 3.42, 3.36, 3.6, 2.03, 2.22, 3.82, 3.01, 1.51, 2.52, 3.23, 2.63, 5.22, 4.25, 3.26, 2.09, 4.08, 3.62, 2.08, 4.71, 2.16, 2.82, 3.16, 2.3, 1.77, 3.72, 3.91, 2.03, 0.3, 3.32, 2.79, 3.27, 2.71, 4.11, 2.15, 1.61, 4.23, 2.82, 4.99, 2.18, 3.02, 3.5, 1.32, 2.52, 1.61, 4.58, 1.99, 2.95, 2.87, 4.68, 3.84, 2.03, 3.2, 1.89, 2.15, 2.55, 3.44, 2.99, 2.47, 2.77, 3.66, 1.9, 3.77, 3.42, 3.96, 2.78, 0.81, -0.34, 1.28, 3.76, 4.19, 3.07, 3.03, 1.28, 2.31, 1.97, 1.67, 2.9, 2.6, 3.71, 3.94, 2.88, 4.74, 3.18, 3.14, 3.72, 2.38, 2.4, 4.16, 3.77, 3.16, 2.45, 1.75, 2.39, 3.26, 3.32, 3.35, 3.93, 1.55)
hypdata_poor <- data.frame(response=resp, explanatory=expl)
str(hypdata_poor)
## 'data.frame': 200 obs. of 2 variables:
## $ response : num 19.1 21.6 19.1 20.6 21.8 ...
## $ explanatory: num 2.14 3.04 2.48 2.91 3.12 2.45 3.62 3.7 3.11 4.18 ...
# Plot the data
ggplot(hypdata_poor, aes(x=explanatory, y=response)) +
geom_point()
# Create an augmented model (adds .fitted / .resid etc. for diagnostics)
poor_lm <- lm(response ~ explanatory, data=hypdata_poor) %>%
broom::augment()
# Plot the residuals; contrast with hypdata_nice — a visible pattern here
# signals a violation of the linear-model assumptions
ggplot(poor_lm, aes(x=.fitted, y=.resid)) +
geom_point()
# The data provided in this exercise (hypdata_out) has an extreme outlier. You will run the linear model with and without the outlying point to see how one observation can affect the estimate of the line.
expl <- c(2.14, 3.04, 2.48, 2.91, 3.12, 2.45, 3.62, 3.7, 3.11, 4.18, 3.07, 3, 3.52, 3.3, 3.58, 1.35, 2.91, 3.66, 3.97, 3.23, 3.61, 3.45, 3.36, 2.95, 3.86, 4.38, 1.73, 3.1, 3.68, 3.62, 3.42, 2.61, 3.74, 3.1, 2.07, 3.85, 2.76, 4.03, 0.91, 4.29, 2.44, 3.14, 3.59, 2.03, 1.58, 2.2, 4.65, 2.1, 1.96, 2.57)
# The outlier is the response value 500 paired with explanatory 4.65
resp <- c(23.06, 21.85, 14.4, 27.13, 5.31, 15.71, 10.51, 26, 20.96, 22.71, 17.17, 23.26, 44.96, 30.77, 24.49, 15.47, 2.14, 23.32, 10.12, 22.58, 4.63, 19.91, 44.7, 14.24, 30.7, 27.72, 28.72, 15.84, 3.65, 13.99, 33.68, 22.02, 6.66, 7.07, 17.56, 14.94, 28.6, 33.76, 14.16, 17.31, 29.39, 46.02, 32.34, 19.49, -5.37, 26.08, 500, 17.81, 28.03, 18.73)
hypdata_out <- data.frame(response=resp, explanatory=expl)
str(hypdata_out)
## 'data.frame': 50 obs. of 2 variables:
## $ response : num 23.06 21.85 14.4 27.13 5.31 ...
## $ explanatory: num 2.14 3.04 2.48 2.91 3.12 2.45 3.62 3.7 3.11 4.18 ...
# Plot the data and a linear model
ggplot(hypdata_out, aes(x=explanatory, y=response)) +
geom_point() +
stat_smooth(method="lm", se=FALSE)
# Remove the outlier: 4.65 is the only explanatory value above 4.6, so this
# filter drops exactly the one outlying observation
hypdata_noout <- hypdata_out %>%
filter(explanatory < 4.6)
# Plot all the data and both models (blue = with outlier, red = without)
ggplot(hypdata_out, aes(x=explanatory, y=response)) +
geom_point() +
stat_smooth(method="lm", se=FALSE) +
stat_smooth(data=hypdata_noout, method="lm", se=FALSE, color="red")
# Examine the tidy model — with the outlier the slope is large and
# "significant" (p ~= 0.036)
lm(response ~ explanatory, data=hypdata_out) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -46.27359 36.60763 -1.264042 0.21231907
## 2 explanatory 24.99987 11.55650 2.163274 0.03552965
# Examine the new tidy model — without the outlier the slope is no longer
# significant (p ~= 0.33): one point drove the earlier conclusion
lm(response ~ explanatory, data=hypdata_noout) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 14.769176 6.185344 2.3877698 0.02102301
## 2 explanatory 1.957367 1.976048 0.9905467 0.32697934
# The data frames perm_slope_out and perm_slope_noout are also in your workspace. These data frames hold the permuted slopes for each of the original datasets
# Finally, the observed values are stored in the variables obs_slope_out and obs_slope_noout.
# NOTE(review): the code below stays commented out because the perm_slope_*
# data frames exist only in the DataCamp workspace and are not built here.
# Calculate the p-value with the outlier
# perm_slope_out %>%
# mutate(abs_perm_slope = abs(stat)) %>%
# summarize(p_value = mean(abs_perm_slope > abs(obs_slope_out)))
# Calculate the p-value without the outlier
# perm_slope_noout %>%
# mutate(abs_perm_slope = abs(stat)) %>%
# summarize(p_value = mean(abs_perm_slope > abs(obs_slope_noout)))
# The dataset data_nonlin has been preloaded.
# (Reconstructed from literal vectors; the relationship is non-linear.)
expl <- c(0.19, 0.34, 0.51, 0.45, 0.3, 0.92, 0.47, 0.93, 0.55, 0.93, 0.29, 0, 0.73, 0.69, 0.76, 0.86, 0.54, 0.95, 0.88, 0.64, 0.53, 0.39, 0.5, 0.53, 0.7, 0.11, 0.62, 0.69, 0.72, 0.27, 0.05, 0.45, 0.46, 0.51, 0.74, 0.86, 0.83, 0.17, 0.59, 0.45, 0.73, 0.31, 0.67, 0.83, 0.64, 0.35, 0.48, 0.2, 0.8, 0.83, 0.92, 0.34, 0.1, 0.91, 0.54, 0.08, 0.75, 0.45, 0.73, 0.11, 0.66, 0.31, 0.35, 0.18, 0.77, 0.96, 0.54, 0.59, 0.18, 0.15, 0.8, 0.21, 0.4, 0.27, 0.85, 0.64, 0.02, 0.84, 0.9, 0.42, 0.29, 0.83, 0.56, 0.78, 0.72, 0.51, 0.17, 0.18, 0.08, 0.71, 0.21, 0.97, 0.95, 0.64, 0.18, 0.55, 0.15, 0.72, 0.33, 0.73)
resp <- c(11.62, 12.81, 15.01, 15.05, 10.67, 25.18, 13.41, 25.96, 16.09, 25.48, 11.75, 10.24, 22.46, 20.02, 21.09, 23.65, 14.23, 26.45, 22.89, 18.12, 13.96, 13.25, 17.5, 15.18, 20.11, 10.79, 18.75, 18.65, 17.96, 10.99, 11.33, 14.58, 12.96, 14.1, 20.47, 22.88, 23.77, 11.89, 16.97, 13.6, 21.14, 14.87, 19.76, 23.05, 15.82, 13.64, 13.56, 11.11, 23.14, 22.78, 25.49, 13.64, 10.88, 25.64, 16.58, 9.34, 19.95, 15.15, 20.22, 9.04, 18.09, 12.59, 12.51, 13.24, 22.49, 26.83, 15.11, 18.06, 11.61, 9.84, 23.75, 10.41, 13.49, 12.02, 22.65, 16.75, 10.78, 24.02, 23.93, 11.28, 12.44, 22.55, 16.57, 21.24, 21.07, 14.67, 9.53, 12.24, 10.15, 21.68, 10.5, 27.13, 26.91, 16.33, 10.58, 14.71, 12.36, 18.83, 12.6, 20)
data_nonlin <- data.frame(response=resp, explanatory=expl)
str(data_nonlin)
## 'data.frame': 100 obs. of 2 variables:
## $ response : num 11.6 12.8 15 15.1 10.7 ...
## $ explanatory: num 0.19 0.34 0.51 0.45 0.3 0.92 0.47 0.93 0.55 0.93 ...
# Create an augmented model using the non-linear data
lm_nonlin <- lm(response ~ explanatory, data=data_nonlin) %>%
broom::augment()
# Plot the residuals; geom_abline draws the y = 0 reference line, and a
# curved pattern around it reveals the unmodeled non-linearity
ggplot(lm_nonlin, aes(x=.fitted, y=.resid)) +
geom_point() +
geom_abline(slope = 0, intercept = 0)
# Create a second augmented model: I() protects ^ from formula expansion,
# so explanatory^2 enters as a genuine quadratic term
lm2_nonlin <- lm(response ~ explanatory + I(explanatory^2), data=data_nonlin) %>%
broom::augment()
# Plot the second set of residuals — with the quadratic term they should
# scatter without pattern around zero
ggplot(lm2_nonlin, aes(x=.fitted, y=.resid)) +
geom_point() +
geom_abline(slope = 0, intercept = 0)
# In this next example, it appears as though the variance of the response variable increases as the explanatory variable increases
# Note that the fix in this exercise has the effect of changing both the variability as well as modifying the linearity of the relationship
# The dataset data_nonequalvar has been preloaded
expl <- c(48.9, 78.2, 39.5, 42.9, 79.9, 57.9, 35.1, 50.7, 62.6, 63.3, 38.1, 75.8, 43.6, 48.8, 78, 44.8, 29.2, 57.7, 60.6, 63.3, 47.3, 54.3, 67.3, 54.8, 52.9, 50.6, 78, 62.2, 45.8, 40.5, 52.1, 58.7, 31.1, 61.8, 52.6, 63.7, 37.8, 42.5, 29.5, 51.3, 57.2, 70.3, 46.1, 91.2, 74.9, 47.3, 34.8, 29.3, 42, 57.9, 48.1, 35, 56.7, 71.9, 47.9, 49.9, 36.3, 53.6, 50.3, 60, 53.6, 67.1, 55.6, 44.2, 56.4, 41.5, 30.4, 84.1, 48.8, 59.6, 50.8, 59.3, 94.6, 70.5, 45.4, 44.8, 36.2, 87.6, 68.9, 55.8, 64.9, 33.2, 89.9, 37.9, 54.7, 64.6, 71.6, 65.8, 48.3, 67.5, 62.1, 63.6, 67.8, 54.2, 55.6, 65.4, 55.4, 50.2, 81.3, 57.9, 62.1, 55.3, 75.5, 65, 65.6, 53.1, 71.3, 53.1, 63.3, 45.3, 61, 54, 44.8, 66.5, 55.2, 67.8, 43.2, 46.9, 57.1, 92.2, 70.1, 49.7, 46.2, 67, 29.8, 40.8, 62.6, 60.4, 86.4, 42.2, 42.9, 69.5, 63.1, 46.2, 38.5, 43.7, 53.3, 60.3, 32.6, 72.5)
resp <- c(127.15, 45.06, 15.54, 26.25, 17.78, 47.81, 14.22, 104.44, 134.4, 3.21, 138.4, 59.52, 26.54, 4.43, 65.15, 21.73, 8.6, 132.58, 84.87, 242.71, 23.36, 21.16, 29.59, 100.78, 135.44, 21.39, 90.71, 12.22, 34.61, 104.3, 102.54, 9.25, 32.13, 17.37, 22.74, 20.3, 99.88, 33.7, 26.17, 9.67, 2.23, 173.31, 46.49, 339.71, 110.22, 82.22, 4.93, 6.09, 12.88, 37.66, 59.45, 5.12, 37.84, 67.36, 30.94, 30.22, 12.6, 14.14, 106.09, 52.13, 4.72, 35.19, 7.49, 35.67, 28.08, 56.13, 66.75, 69.87, 65.66, 9.08, 89.92, 20.81, 43.22, 59.37, 21.8, 34.34, 1.65, 92.08, 36.89, 63.7, 23.8, 15.55, 79.21, 35.77, 74.66, 55.85, 58.33, 41.08, 53.43, 47.58, 46.57, 23.1, 305.41, 51.99, 39.4, 49.44, 116.64, 110, 120.17, 41.52, 60.48, 26.31, 121.42, 111.76, 33.76, 43.43, 150.36, 31.19, 30.25, 74.32, 132.18, 34.32, 20.45, 106.13, 47.9, 110.07, 66.47, 19.96, 42.72, 361.12, 281.52, 139.26, 22.22, 26.9, 7.66, 3.78, 52.8, 47.61, 81.24, 80.17, 19.48, 7.72, 43.5, 51.48, 37.18, 15.6, 36.02, 6.85, 15.42, 214.95)
data_nonequalvar <- data.frame(response=resp, explanatory=expl)
str(data_nonequalvar)
## 'data.frame': 140 obs. of 2 variables:
## $ response : num 127.2 45.1 15.5 26.2 17.8 ...
## $ explanatory: num 48.9 78.2 39.5 42.9 79.9 57.9 35.1 50.7 62.6 63.3 ...
# Create an augmented model
lm_nonequalvar <- lm(response ~ explanatory, data=data_nonequalvar) %>%
broom::augment()
# Plot the residuals; a "fan" that widens with the fitted values indicates
# heteroscedasticity (non-constant variance)
ggplot(lm_nonequalvar, aes(x=.fitted, y=.resid)) +
geom_point() +
geom_abline(slope = 0, intercept = 0)
# Create an augmented model using the log of the response — the standard
# variance-stabilizing transform for this kind of fan shape
lm2_nonequalvar <- lm(log(response) ~ explanatory, data=data_nonequalvar) %>%
broom::augment()
# Plot the log of the response
ggplot(data_nonequalvar, aes(x=explanatory, y=log(response))) +
geom_point() +
stat_smooth(method="lm", se=FALSE)
# Plot the second set of residuals — the fan should now be gone
ggplot(lm2_nonequalvar, aes(x=.fitted, y=.resid)) +
geom_point() +
geom_abline(slope = 0, intercept = 0)
# In this last example, it appears as though the points are not normally distributed around the regression line
# Again, note that the fix in this exercise has the effect of changing both the variability as well as modifying the linearity of the relationship
# The dataset data_nonnorm has been preloaded
resp <- c(190.58, 187.28, 172.34, 291.5, 43.66, 315.81, 94.42, 417.19, 234.56, 343.66, 127.73, 119.66, 690.5, 416.69, 334.43, 337.93, 64.21, 386.13, 176.61, 280.16, 64.13, 167.81, 578.18, 160.62, 393.2, 147.78, 432.51, 216.36, 88.97, 83.47, 226.85, 235.78, 64.11, 89.33, 295.13, 230.52, 469.54, 241.46, 246.09, 131.25, 453.87, 527.73, 422.67, 356.65, 56.87, 272, 89.89, 138.36, 488.88, 321.64, 388.48, 287.1, 161.48, 423.7, 315.97, 47.3, 207.29, 314.94, 300.95, 26.88, 215.97, 196.3, 144.08, 428.02, 516.09, 423.95, 138.73, 408.28, 202.76, 60.61, 617.6, 75.43, 177.11, 176.66, 246.48, 131.29, 170.23, 485.36, 229.54, 11.85, 200.46, 304.64, 276.42, 277.58, 468.18, 138.25, 37.26, 279.1, 101.36, 628.17, 78.01, 391.71, 462.21, 92.89, 98.36, 97.8, 317.43, 172.27, 172.11, 281.36)
expl <- c(0.19, 0.34, 0.51, 0.45, 0.3, 0.92, 0.47, 0.93, 0.55, 0.93, 0.29, 0, 0.73, 0.69, 0.76, 0.86, 0.54, 0.95, 0.88, 0.64, 0.53, 0.39, 0.5, 0.53, 0.7, 0.11, 0.62, 0.69, 0.72, 0.27, 0.05, 0.45, 0.46, 0.51, 0.74, 0.86, 0.83, 0.17, 0.59, 0.45, 0.73, 0.31, 0.67, 0.83, 0.64, 0.35, 0.48, 0.2, 0.8, 0.83, 0.92, 0.34, 0.1, 0.91, 0.54, 0.08, 0.75, 0.45, 0.73, 0.11, 0.66, 0.31, 0.35, 0.18, 0.77, 0.96, 0.54, 0.59, 0.18, 0.15, 0.8, 0.21, 0.4, 0.27, 0.85, 0.64, 0.02, 0.84, 0.9, 0.42, 0.29, 0.83, 0.56, 0.78, 0.72, 0.51, 0.17, 0.18, 0.08, 0.71, 0.21, 0.97, 0.95, 0.64, 0.18, 0.55, 0.15, 0.72, 0.33, 0.73)
data_nonnorm <- data.frame(response=resp, explanatory=expl)
str(data_nonnorm)
## 'data.frame': 100 obs. of 2 variables:
## $ response : num 190.6 187.3 172.3 291.5 43.7 ...
## $ explanatory: num 0.19 0.34 0.51 0.45 0.3 0.92 0.47 0.93 0.55 0.93 ...
# Create an augmented model of the data
lm_nonnorm <- lm(response ~ explanatory, data=data_nonnorm) %>%
broom::augment()
# Plot the residuals; skewed scatter around the y = 0 line suggests the
# errors are not normally distributed
ggplot(lm_nonnorm, aes(x=.fitted, y=.resid)) +
geom_point() +
geom_abline(slope = 0, intercept = 0)
# Create the second augmented model using a square-root transform of the
# response to pull in the right skew
lm2_nonnorm <- lm(sqrt(response) ~ explanatory, data=data_nonnorm) %>%
broom::augment()
# Plot the square root of the response
ggplot(data_nonnorm, aes(x=explanatory, y=sqrt(response))) +
geom_point() +
stat_smooth(method="lm", se=FALSE)
# Plot the second set of residuals — more symmetric after the transform
ggplot(lm2_nonnorm, aes(x=.fitted, y=.resid)) +
geom_point() +
geom_abline(slope = 0, intercept = 0)
Chapter 5 - Building on Inference in Simple Regression
Inference on transformed variables - interpretation of the coefficients is no longer just slope for y ~ x:
Multicollinearity - process of some or more of the predictor variables being correlated:
Multiple linear regression:
Summary:
Example code includes:
# Load the two course datasets from local CSVs. The `## Parsed with column
# specification` blocks are readr's column-type messages captured at knit
# time; str(..., give.attr=FALSE) suppresses the tbl_df spec attribute.
LAhomes <- readr::read_csv("./RInputFiles/LAhomes.csv")
## Parsed with column specification:
## cols(
## city = col_character(),
## type = col_character(),
## bed = col_integer(),
## bath = col_double(),
## garage = col_character(),
## sqft = col_integer(),
## pool = col_character(),
## spa = col_character(),
## price = col_double()
## )
str(LAhomes, give.attr=FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1594 obs. of 9 variables:
## $ city : chr "Long Beach" "Long Beach" "Long Beach" "Long Beach" ...
## $ type : chr NA NA NA NA ...
## $ bed : int 0 0 0 0 0 1 1 1 1 1 ...
## $ bath : num 1 1 1 1 1 1 1 1 1 1 ...
## $ garage: chr NA NA NA "1" ...
## $ sqft : int 513 550 550 1030 1526 552 558 596 744 750 ...
## $ pool : chr NA NA NA NA ...
## $ spa : chr NA NA NA NA ...
## $ price : num 119000 153000 205000 300000 375000 ...
# NYC Italian restaurant ratings (price plus Zagat-style scores; see the
# note at the first restNYC model below for the source attribution)
restNYC <- readr::read_csv("./RInputFiles/restNYC.csv")
## Parsed with column specification:
## cols(
## Case = col_integer(),
## Restaurant = col_character(),
## Price = col_integer(),
## Food = col_integer(),
## Decor = col_integer(),
## Service = col_integer(),
## East = col_integer()
## )
str(restNYC, give.attr=FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 168 obs. of 7 variables:
## $ Case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Restaurant: chr "Daniella Ristorante" "Tello's Ristorante" "Biricchino" "Bottino" ...
## $ Price : int 43 32 34 41 54 52 34 34 39 44 ...
## $ Food : int 22 20 21 20 24 22 22 20 22 21 ...
## $ Decor : int 18 19 13 20 19 22 16 18 19 17 ...
## $ Service : int 20 19 18 17 21 21 21 21 22 19 ...
## $ East : int 0 0 0 0 0 0 0 1 1 1 ...
# Using tidy output, run an lm analysis on price versus sqft for the LAhomes dataset.
# Run one more analysis, but this time on transformed variables: log(price) versus log(sqft).
# Create a tidy model
lm(price ~ sqft, data=LAhomes) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -1661892.391 64459.91198 -25.78180 8.851557e-123
## 2 sqft 1485.995 22.70924 65.43569 0.000000e+00
# Create a tidy model using the log of both variables. On the log-log
# scale the slope is an elasticity: a 1% change in sqft is associated with
# a ~1.44% change in price
lm(log(price) ~ log(sqft), data=LAhomes) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 2.702788 0.14369289 18.80948 1.972382e-71
## 2 log(sqft) 1.441583 0.01953529 73.79375 0.000000e+00
# Output the tidy model. Adding log(bath) — strongly correlated with
# log(sqft) — renders it non-significant: multicollinearity in action
lm(log(price) ~ log(sqft) + log(bath), data=LAhomes) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 2.51405101 0.26186485 9.6005668 2.957034e-21
## 2 log(sqft) 1.47120722 0.03952669 37.2206050 1.194181e-218
## 3 log(bath) -0.03904504 0.04528729 -0.8621632 3.887276e-01
# Using the NYC Italian restaurants dataset (compiled by Simon Sheather in A Modern Approach to Regression with R), restNYC,
# you will investigate the effect on the significance of the coefficients when there are multiple variables in the model
# Recall, the p-value associated with any coefficient is the probability of the observed data given that the particular variable is independent of the response AND given that all other variables are included in the model.
# Output the first model — Service alone appears highly significant
lm(Price ~ Service, data=restNYC) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -11.977811 5.1092741 -2.344327 2.024510e-02
## 2 Service 2.818433 0.2618399 10.763954 7.879529e-21
# Output the second model — once Food and Decor are included, the Service
# coefficient collapses (p ~= 0.73): its earlier "effect" was shared with
# the correlated predictors
lm(Price ~ Service + Food + Decor, data=restNYC) %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -24.6408955 4.7536113 -5.1836160 6.332777e-07
## 2 Service 0.1350457 0.3956525 0.3413239 7.332967e-01
## 3 Food 1.5555712 0.3730821 4.1695147 4.932501e-05
## 4 Decor 1.8473352 0.2175539 8.4913900 1.170666e-14
Chapter 1 - Parallel Slopes
What if you have two groups?
Visualizing parallel slopes models:
Interpreting parallel slopes coefficients:
Three ways to describe a model - Mathematical, Geometric, Syntactic:
Example code includes:
# In this case, we want to understand how the price of MarioKart games sold at auction varies as a function of not only the number of wheels included in the package, but also whether the item is new or used
# A parallel slopes model has the form y ~ x + z, where z is a categorical explanatory variable, and x is a numerical explanatory variable
# Explore the data
data(marioKart, package="openintro")
glimpse(marioKart)
## Observations: 143
## Variables: 12
## $ ID <dbl> 150377422259, 260483376854, 320432342985, 280405224...
## $ duration <int> 3, 7, 3, 3, 1, 3, 1, 1, 3, 7, 1, 1, 1, 1, 7, 7, 3, ...
## $ nBids <int> 20, 13, 16, 18, 20, 19, 13, 15, 29, 8, 15, 15, 13, ...
## $ cond <fct> new, used, new, new, new, new, used, new, used, use...
## $ startPr <dbl> 0.99, 0.99, 0.99, 0.99, 0.01, 0.99, 0.01, 1.00, 0.9...
## $ shipPr <dbl> 4.00, 3.99, 3.50, 0.00, 0.00, 4.00, 0.00, 2.99, 4.0...
## $ totalPr <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53...
## $ shipSp <fct> standard, firstClass, firstClass, standard, media, ...
## $ sellerRate <int> 1580, 365, 998, 7, 820, 270144, 7284, 4858, 27, 201...
## $ stockPhoto <fct> yes, yes, no, yes, yes, yes, yes, yes, yes, no, yes...
## $ wheels <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, ...
## $ title <fct> ~~ Wii MARIO KART & WHEEL ~ NINTENDO Wii ~ BRAN...
# fit parallel slopes; the outer parentheses make the assignment also print
# the model. `cond` is a factor, so lm() encodes it as the dummy `condused`
# (shift in intercept, identical slope for both groups)
(mod <- lm(totalPr ~ wheels + cond, data=marioKart))
##
## Call:
## lm(formula = totalPr ~ wheels + cond, data = marioKart)
##
## Coefficients:
## (Intercept) wheels condused
## 37.6673 10.2161 0.8457
# The parallel slopes model mod relating total price to the number of wheels and condition is already in your workspace.
# Augment the model
augmented_mod <- broom::augment(mod)
glimpse(augmented_mod)
## Observations: 143
## Variables: 10
## $ totalPr <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53...
## $ wheels <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, ...
## $ cond <fct> new, used, new, new, new, new, used, new, used, use...
## $ .fitted <dbl> 47.88342, 48.72916, 47.88342, 47.88342, 58.09954, 3...
## $ .se.fit <dbl> 3.532896, 2.696310, 3.532896, 3.532896, 3.375000, 5...
## $ .resid <dbl> 3.666579, -11.689162, -2.383421, -3.883421, 12.9004...
## $ .hat <dbl> 0.02093127, 0.01219196, 0.02093127, 0.02093127, 0.0...
## $ .sigma <dbl> 24.504954, 24.486658, 24.506118, 24.504709, 24.4820...
## $ .cooksd <dbl> 1.640983e-04, 9.543509e-04, 6.933998e-05, 1.840819e...
## $ .std.resid <dbl> 0.15174745, -0.48163062, -0.09864186, -0.16072186, ...
# scatterplot, with color mapped to condition
data_space <- ggplot(augmented_mod, aes(x = wheels, y = totalPr, color = cond)) +
geom_point()
# single call to geom_line(): mapping y to .fitted draws one parallel
# fitted line per condition group
data_space +
geom_line(aes(y = .fitted))
# The babies data set contains observations about the birthweight and other characteristics of children born in the San Francisco Bay area from 1960--1967
# We would like to build a model for birthweight as a function of the mother's age and whether this child was her first (parity == 0)
# birthweight=β0+β1⋅age+β2⋅parity+ϵ
data(babies, package="openintro")
str(babies)
## 'data.frame': 1236 obs. of 8 variables:
## $ case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ bwt : int 120 113 128 123 108 136 138 132 120 143 ...
## $ gestation: int 284 282 279 NA 282 286 244 245 289 299 ...
## $ parity : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 27 33 28 36 23 25 33 23 25 30 ...
## $ height : int 62 64 64 69 67 62 62 65 62 66 ...
## $ weight : int 100 135 115 190 125 93 178 140 125 136 ...
## $ smoke : int 0 0 1 0 1 0 0 0 0 1 ...
# build model: birthweight vs mother's age + parity (parity is coded 0/1
# here, so it acts as the categorical term of a parallel slopes model)
lm(bwt ~ age + parity, data=babies)
##
## Call:
## lm(formula = bwt ~ age + parity, data = babies)
##
## Coefficients:
## (Intercept) age parity
## 118.27782 0.06315 -1.65248
# build model: birthweight vs gestation length + smoking status (smoke is
# likewise 0/1); lm() drops rows with NA gestation by default
lm(bwt ~ gestation + smoke, data=babies)
##
## Call:
## lm(formula = bwt ~ gestation + smoke, data = babies)
##
## Coefficients:
## (Intercept) gestation smoke
## -0.9317 0.4429 -8.0883
Chapter 2 - Evaluating and Extending Parallel Slopes
Model fit, residuals, and prediction:
Understanding interaction - idea that the model might have both different slopes and different intercepts:
* The interaction term is the product of two (or more) variables - for example Y ~ X1 + X2 + X1:X2
* The R syntax colon (:) means X1×X2, or the interaction between X1 and X2
Interaction terms can change the interpretation of the model, and also of the components of the model
* Including an interaction term in a model is easy—we just have to tell lm() that we want to include that new variable. An expression of the form
* lm(y ~ x + z + x:z, data = mydata)
* The use of the colon (:) here means that the interaction between x and z will be a third term in the model
* Interaction models are easy to visualize with ggplot2 because they have the same coefficients as if the models were fit independently to each group defined by the level of the categorical variable
* In this case, new and used MarioKarts each get their own regression line
* To see this, we can set an aesthetic (e.g. color) to the categorical variable, and then add a geom_smooth() layer to overlay the regression line for each color
Simpson’s Paradox:
Example code includes:
# Restrict to auctions at or below $75 to drop the extreme-price outliers
# used in the DataCamp exercise; note the coefficients change noticeably
# from the full-data model above
mario_kart <- marioKart %>% filter(totalPr <= 75)
# fit parallel slopes
(mod <- lm(totalPr ~ wheels + cond, data=mario_kart))
##
## Call:
## lm(formula = totalPr ~ wheels + cond, data = mario_kart)
##
## Coefficients:
## (Intercept) wheels condused
## 42.370 7.233 -5.585
# R^2 and adjusted R^2
summary(mod)
##
## Call:
## lm(formula = totalPr ~ wheels + cond, data = mario_kart)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0078 -3.0754 -0.8254 2.9822 14.1646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.3698 1.0651 39.780 < 2e-16 ***
## wheels 7.2328 0.5419 13.347 < 2e-16 ***
## condused -5.5848 0.9245 -6.041 1.35e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.887 on 138 degrees of freedom
## Multiple R-squared: 0.7165, Adjusted R-squared: 0.7124
## F-statistic: 174.4 on 2 and 138 DF, p-value: < 2.2e-16
# add random noise — a predictor with no real relationship to the response.
# NOTE(review): rnorm() is not seeded, so re-running will not reproduce the
# exact transcript values below, only the qualitative conclusion
mario_kart_noisy <- mario_kart %>%
mutate(noise = rnorm(n=n()))
# compute new model
mod2 <- lm(totalPr ~ wheels + cond + noise, data=mario_kart_noisy)
# new R^2 and adjusted R^2: multiple R^2 always rises when a variable is
# added (0.7165 -> 0.7207) even for pure noise; adjusted R^2 penalizes the
# extra term and is the fairer comparison
summary(mod2)
##
## Call:
## lm(formula = totalPr ~ wheels + cond + noise, data = mario_kart_noisy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9844 -2.9707 -0.9883 2.8480 14.2079
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.5665 1.0701 39.777 < 2e-16 ***
## wheels 7.2002 0.5404 13.324 < 2e-16 ***
## condused -5.7546 0.9288 -6.196 6.36e-09 ***
## noise -0.5537 0.3888 -1.424 0.157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.869 on 137 degrees of freedom
## Multiple R-squared: 0.7207, Adjusted R-squared: 0.7145
## F-statistic: 117.8 on 3 and 137 DF, p-value: < 2.2e-16
# return a vector: predict() on a fitted lm with no newdata gives the
# fitted values for the 141 training observations, named by row number
predict(mod)
## 1 2 3 4 5 6 7 8
## 49.60260 44.01777 49.60260 49.60260 56.83544 42.36976 36.78493 56.83544
## 9 10 11 12 13 14 15 16
## 44.01777 44.01777 56.83544 56.83544 56.83544 56.83544 44.01777 36.78493
## 17 18 19 20 21 22 23 24
## 49.60260 49.60260 56.83544 36.78493 56.83544 56.83544 56.83544 44.01777
## 25 26 27 28 29 30 31 32
## 56.83544 36.78493 36.78493 36.78493 49.60260 36.78493 36.78493 44.01777
## 33 34 35 36 37 38 39 40
## 51.25061 44.01777 44.01777 36.78493 44.01777 56.83544 56.83544 49.60260
## 41 42 43 44 45 46 47 48
## 44.01777 51.25061 56.83544 56.83544 44.01777 56.83544 36.78493 36.78493
## 49 50 51 52 53 54 55 56
## 44.01777 56.83544 36.78493 44.01777 42.36976 36.78493 36.78493 44.01777
## 57 58 59 60 61 62 63 64
## 44.01777 36.78493 36.78493 56.83544 36.78493 56.83544 36.78493 51.25061
## 65 66 67 68 69 70 71 72
## 56.83544 44.01777 58.48345 51.25061 49.60260 44.01777 49.60260 56.83544
## 73 74 75 76 77 78 79 80
## 56.83544 51.25061 44.01777 36.78493 36.78493 36.78493 44.01777 56.83544
## 81 82 83 84 85 86 87 88
## 44.01777 65.71629 44.01777 56.83544 36.78493 49.60260 49.60260 36.78493
## 89 90 91 92 93 94 95 96
## 44.01777 36.78493 51.25061 44.01777 36.78493 51.25061 42.36976 56.83544
## 97 98 99 100 101 102 103 104
## 51.25061 44.01777 51.25061 56.83544 56.83544 56.83544 36.78493 49.60260
## 105 106 107 108 109 110 111 112
## 51.25061 44.01777 56.83544 49.60260 36.78493 44.01777 51.25061 56.83544
## 113 114 115 116 117 118 119 120
## 64.06828 44.01777 49.60260 44.01777 49.60260 51.25061 42.36976 44.01777
## 121 122 123 124 125 126 127 128
## 56.83544 44.01777 49.60260 44.01777 51.25061 56.83544 56.83544 49.60260
## 129 130 131 132 133 134 135 136
## 56.83544 36.78493 44.01777 44.01777 36.78493 56.83544 36.78493 44.01777
## 137 138 139 140 141
## 36.78493 51.25061 49.60260 36.78493 56.83544
# return a data frame: broom::augment() gives the same fitted values as
# predict() above, but as a data frame with diagnostics alongside the model
# variables (transcript of the output follows)
broom::augment(mod)
## totalPr wheels cond .fitted .se.fit .resid .hat
## 1 51.55 1 new 49.60260 0.7087865 1.94739955 0.02103158
## 2 37.04 1 used 44.01777 0.5465195 -6.97776738 0.01250410
## 3 45.50 1 new 49.60260 0.7087865 -4.10260045 0.02103158
## 4 44.00 1 new 49.60260 0.7087865 -5.60260045 0.02103158
## 5 71.00 2 new 56.83544 0.6764502 14.16455915 0.01915635
## 6 45.00 0 new 42.36976 1.0651119 2.63023994 0.04749321
## 7 37.02 0 used 36.78493 0.7065565 0.23507301 0.02089945
## 8 53.99 2 new 56.83544 0.6764502 -2.84544085 0.01915635
## 9 47.00 1 used 44.01777 0.5465195 2.98223262 0.01250410
## 10 50.00 1 used 44.01777 0.5465195 5.98223262 0.01250410
## 11 54.99 2 new 56.83544 0.6764502 -1.84544085 0.01915635
## 12 56.01 2 new 56.83544 0.6764502 -0.82544085 0.01915635
## 13 48.00 2 new 56.83544 0.6764502 -8.83544085 0.01915635
## 14 56.00 2 new 56.83544 0.6764502 -0.83544085 0.01915635
## 15 43.33 1 used 44.01777 0.5465195 -0.68776738 0.01250410
## 16 46.00 0 used 36.78493 0.7065565 9.21507301 0.02089945
## 17 46.71 1 new 49.60260 0.7087865 -2.89260045 0.02103158
## 18 46.00 1 new 49.60260 0.7087865 -3.60260045 0.02103158
## 19 55.99 2 new 56.83544 0.6764502 -0.84544085 0.01915635
## 20 31.00 0 used 36.78493 0.7065565 -5.78492699 0.02089945
## 21 53.98 2 new 56.83544 0.6764502 -2.85544085 0.01915635
## 22 64.95 2 new 56.83544 0.6764502 8.11455915 0.01915635
## 23 50.50 2 new 56.83544 0.6764502 -6.33544085 0.01915635
## 24 46.50 1 used 44.01777 0.5465195 2.48223262 0.01250410
## 25 55.00 2 new 56.83544 0.6764502 -1.83544085 0.01915635
## 26 34.50 0 used 36.78493 0.7065565 -2.28492699 0.02089945
## 27 36.00 0 used 36.78493 0.7065565 -0.78492699 0.02089945
## 28 40.00 0 used 36.78493 0.7065565 3.21507301 0.02089945
## 29 47.00 1 new 49.60260 0.7087865 -2.60260045 0.02103158
## 30 43.00 0 used 36.78493 0.7065565 6.21507301 0.02089945
## 31 31.00 0 used 36.78493 0.7065565 -5.78492699 0.02089945
## 32 41.99 1 used 44.01777 0.5465195 -2.02776738 0.01250410
## 33 49.49 2 used 51.25061 0.8279109 -1.76060777 0.02869514
## 34 41.00 1 used 44.01777 0.5465195 -3.01776738 0.01250410
## 35 44.78 1 used 44.01777 0.5465195 0.76223262 0.01250410
## 36 47.00 0 used 36.78493 0.7065565 10.21507301 0.02089945
## 37 44.00 1 used 44.01777 0.5465195 -0.01776738 0.01250410
## 38 63.99 2 new 56.83544 0.6764502 7.15455915 0.01915635
## 39 53.76 2 new 56.83544 0.6764502 -3.07544085 0.01915635
## 40 46.03 1 new 49.60260 0.7087865 -3.57260045 0.02103158
## 41 42.25 1 used 44.01777 0.5465195 -1.76776738 0.01250410
## 42 46.00 2 used 51.25061 0.8279109 -5.25060777 0.02869514
## 43 51.99 2 new 56.83544 0.6764502 -4.84544085 0.01915635
## 44 55.99 2 new 56.83544 0.6764502 -0.84544085 0.01915635
## 45 41.99 1 used 44.01777 0.5465195 -2.02776738 0.01250410
## 46 53.99 2 new 56.83544 0.6764502 -2.84544085 0.01915635
## 47 39.00 0 used 36.78493 0.7065565 2.21507301 0.02089945
## 48 38.06 0 used 36.78493 0.7065565 1.27507301 0.02089945
## 49 46.00 1 used 44.01777 0.5465195 1.98223262 0.01250410
## 50 59.88 2 new 56.83544 0.6764502 3.04455915 0.01915635
## 51 28.98 0 used 36.78493 0.7065565 -7.80492699 0.02089945
## 52 36.00 1 used 44.01777 0.5465195 -8.01776738 0.01250410
## 53 51.99 0 new 42.36976 1.0651119 9.62023994 0.04749321
## 54 43.95 0 used 36.78493 0.7065565 7.16507301 0.02089945
## 55 32.00 0 used 36.78493 0.7065565 -4.78492699 0.02089945
## 56 40.06 1 used 44.01777 0.5465195 -3.95776738 0.01250410
## 57 48.00 1 used 44.01777 0.5465195 3.98223262 0.01250410
## 58 36.00 0 used 36.78493 0.7065565 -0.78492699 0.02089945
## 59 31.00 0 used 36.78493 0.7065565 -5.78492699 0.02089945
## 60 53.99 2 new 56.83544 0.6764502 -2.84544085 0.01915635
## 61 30.00 0 used 36.78493 0.7065565 -6.78492699 0.02089945
## 62 58.00 2 new 56.83544 0.6764502 1.16455915 0.01915635
## 63 38.10 0 used 36.78493 0.7065565 1.31507301 0.02089945
## 64 61.76 2 used 51.25061 0.8279109 10.50939223 0.02869514
## 65 53.99 2 new 56.83544 0.6764502 -2.84544085 0.01915635
## 66 40.00 1 used 44.01777 0.5465195 -4.01776738 0.01250410
## 67 64.50 3 used 58.48345 1.2882085 6.01655183 0.06947257
## 68 49.01 2 used 51.25061 0.8279109 -2.24060777 0.02869514
## 69 47.00 1 new 49.60260 0.7087865 -2.60260045 0.02103158
## 70 40.10 1 used 44.01777 0.5465195 -3.91776738 0.01250410
## 71 41.50 1 new 49.60260 0.7087865 -8.10260045 0.02103158
## 72 56.00 2 new 56.83544 0.6764502 -0.83544085 0.01915635
## 73 64.95 2 new 56.83544 0.6764502 8.11455915 0.01915635
## 74 49.00 2 used 51.25061 0.8279109 -2.25060777 0.02869514
## 75 48.00 1 used 44.01777 0.5465195 3.98223262 0.01250410
## 76 38.00 0 used 36.78493 0.7065565 1.21507301 0.02089945
## 77 45.00 0 used 36.78493 0.7065565 8.21507301 0.02089945
## 78 41.95 0 used 36.78493 0.7065565 5.16507301 0.02089945
## 79 43.36 1 used 44.01777 0.5465195 -0.65776738 0.01250410
## 80 54.99 2 new 56.83544 0.6764502 -1.84544085 0.01915635
## 81 45.21 1 used 44.01777 0.5465195 1.19223262 0.01250410
## 82 65.02 4 used 65.71629 1.7946635 -0.69628856 0.13483640
## 83 45.75 1 used 44.01777 0.5465195 1.73223262 0.01250410
## 84 64.00 2 new 56.83544 0.6764502 7.16455915 0.01915635
## 85 36.00 0 used 36.78493 0.7065565 -0.78492699 0.02089945
## 86 54.70 1 new 49.60260 0.7087865 5.09739955 0.02103158
## 87 49.91 1 new 49.60260 0.7087865 0.30739955 0.02103158
## 88 47.00 0 used 36.78493 0.7065565 10.21507301 0.02089945
## 89 43.00 1 used 44.01777 0.5465195 -1.01776738 0.01250410
## 90 35.99 0 used 36.78493 0.7065565 -0.79492699 0.02089945
## 91 54.49 2 used 51.25061 0.8279109 3.23939223 0.02869514
## 92 46.00 1 used 44.01777 0.5465195 1.98223262 0.01250410
## 93 31.06 0 used 36.78493 0.7065565 -5.72492699 0.02089945
## 94 55.60 2 used 51.25061 0.8279109 4.34939223 0.02869514
## 95 40.10 0 new 42.36976 1.0651119 -2.26976006 0.04749321
## 96 52.59 2 new 56.83544 0.6764502 -4.24544085 0.01915635
## 97 44.00 2 used 51.25061 0.8279109 -7.25060777 0.02869514
## 98 38.26 1 used 44.01777 0.5465195 -5.75776738 0.01250410
## 99 51.00 2 used 51.25061 0.8279109 -0.25060777 0.02869514
## 100 48.99 2 new 56.83544 0.6764502 -7.84544085 0.01915635
## 101 66.44 2 new 56.83544 0.6764502 9.60455915 0.01915635
## 102 63.50 2 new 56.83544 0.6764502 6.66455915 0.01915635
## 103 42.00 0 used 36.78493 0.7065565 5.21507301 0.02089945
## 104 47.00 1 new 49.60260 0.7087865 -2.60260045 0.02103158
## 105 55.00 2 used 51.25061 0.8279109 3.74939223 0.02869514
## 106 33.01 1 used 44.01777 0.5465195 -11.00776738 0.01250410
## 107 53.76 2 new 56.83544 0.6764502 -3.07544085 0.01915635
## 108 46.00 1 new 49.60260 0.7087865 -3.60260045 0.02103158
## 109 43.00 0 used 36.78493 0.7065565 6.21507301 0.02089945
## 110 42.55 1 used 44.01777 0.5465195 -1.46776738 0.01250410
## 111 52.50 2 used 51.25061 0.8279109 1.24939223 0.02869514
## 112 57.50 2 new 56.83544 0.6764502 0.66455915 0.01915635
## 113 75.00 3 new 64.06828 1.0000415 10.93171876 0.04186751
## 114 48.92 1 used 44.01777 0.5465195 4.90223262 0.01250410
## 115 45.99 1 new 49.60260 0.7087865 -3.61260045 0.02103158
## 116 40.05 1 used 44.01777 0.5465195 -3.96776738 0.01250410
## 117 45.00 1 new 49.60260 0.7087865 -4.60260045 0.02103158
## 118 50.00 2 used 51.25061 0.8279109 -1.25060777 0.02869514
## 119 49.75 0 new 42.36976 1.0651119 7.38023994 0.04749321
## 120 47.00 1 used 44.01777 0.5465195 2.98223262 0.01250410
## 121 56.00 2 new 56.83544 0.6764502 -0.83544085 0.01915635
## 122 41.00 1 used 44.01777 0.5465195 -3.01776738 0.01250410
## 123 46.00 1 new 49.60260 0.7087865 -3.60260045 0.02103158
## 124 34.99 1 used 44.01777 0.5465195 -9.02776738 0.01250410
## 125 49.00 2 used 51.25061 0.8279109 -2.25060777 0.02869514
## 126 61.00 2 new 56.83544 0.6764502 4.16455915 0.01915635
## 127 62.89 2 new 56.83544 0.6764502 6.05455915 0.01915635
## 128 46.00 1 new 49.60260 0.7087865 -3.60260045 0.02103158
## 129 64.95 2 new 56.83544 0.6764502 8.11455915 0.01915635
## 130 36.99 0 used 36.78493 0.7065565 0.20507301 0.02089945
## 131 44.00 1 used 44.01777 0.5465195 -0.01776738 0.01250410
## 132 41.35 1 used 44.01777 0.5465195 -2.66776738 0.01250410
## 133 37.00 0 used 36.78493 0.7065565 0.21507301 0.02089945
## 134 58.98 2 new 56.83544 0.6764502 2.14455915 0.01915635
## 135 39.00 0 used 36.78493 0.7065565 2.21507301 0.02089945
## 136 40.70 1 used 44.01777 0.5465195 -3.31776738 0.01250410
## 137 39.51 0 used 36.78493 0.7065565 2.72507301 0.02089945
## 138 52.00 2 used 51.25061 0.8279109 0.74939223 0.02869514
## 139 47.70 1 new 49.60260 0.7087865 -1.90260045 0.02103158
## 140 38.76 0 used 36.78493 0.7065565 1.97507301 0.02089945
## 141 54.51 2 new 56.83544 0.6764502 -2.32544085 0.01915635
## .sigma .cooksd .std.resid
## 1 4.902339 1.161354e-03 0.402708933
## 2 4.868399 8.712334e-03 -1.436710863
## 3 4.892414 5.154337e-03 -0.848389768
## 4 4.881308 9.612441e-03 -1.158579529
## 5 4.750591 5.574926e-02 2.926332759
## 6 4.899816 5.053659e-03 0.551419180
## 7 4.905181 1.681147e-05 0.048608215
## 8 4.899077 2.249739e-03 -0.587854989
## 9 4.898517 1.591419e-03 0.614036807
## 10 4.878184 6.403658e-03 1.231731888
## 11 4.902639 9.463096e-04 -0.381259589
## 12 4.904706 1.893237e-04 -0.170532281
## 13 4.845644 2.169149e-02 -1.825361432
## 14 4.904693 1.939387e-04 -0.172598235
## 15 4.904866 8.464177e-05 -0.141610176
## 16 4.840263 2.583436e-02 1.905485609
## 17 4.898859 2.562308e-03 -0.598170028
## 18 4.895349 3.974537e-03 -0.744993181
## 19 4.904680 1.986092e-04 -0.174664189
## 20 4.879726 1.018113e-02 -1.196202689
## 21 4.899034 2.265580e-03 -0.589920943
## 22 4.855017 1.829628e-02 1.676430592
## 23 4.874681 1.115287e-02 -1.308872933
## 24 4.900578 1.102520e-03 0.511087627
## 25 4.902666 9.360817e-04 -0.379193635
## 26 4.901254 1.588345e-03 -0.472475419
## 27 4.904754 1.874384e-04 -0.162306590
## 28 4.897361 3.144719e-03 0.664810290
## 29 4.900072 2.074290e-03 -0.538200007
## 30 4.875781 1.175148e-02 1.285147949
## 31 4.879726 1.018113e-02 -1.196202689
## 32 4.902124 7.357628e-04 -0.417513979
## 33 4.902848 1.315656e-03 -0.365515142
## 34 4.898356 1.629570e-03 -0.621353356
## 35 4.904785 1.039625e-04 0.156942447
## 36 4.825276 3.174557e-02 2.112264829
## 37 4.905222 5.648698e-08 -0.003658274
## 38 4.866239 1.422325e-02 1.478099008
## 39 4.898043 2.628135e-03 -0.635371930
## 40 4.895513 3.908619e-03 -0.738789386
## 41 4.902867 5.591802e-04 -0.363980405
## 42 4.884059 1.170136e-02 -1.090064849
## 43 4.887380 6.523784e-03 -1.001045788
## 44 4.904680 1.986092e-04 -0.174664189
## 45 4.902124 7.357628e-04 -0.417513979
## 46 4.899077 2.249739e-03 -0.587854989
## 47 4.901493 1.492713e-03 0.458031070
## 48 4.903987 4.946184e-04 0.263658603
## 49 4.902261 7.030898e-04 0.408138446
## 50 4.898186 2.575620e-03 0.628991915
## 51 4.858711 1.853266e-02 -1.613896713
## 52 4.856546 1.150293e-02 -1.650845158
## 53 4.832389 6.760627e-02 2.016844445
## 54 4.866054 1.561857e-02 1.481588208
## 55 4.887793 6.965475e-03 -0.989423469
## 56 4.893406 2.802864e-03 -0.814897814
## 57 4.893260 2.837624e-03 0.819935167
## 58 4.904754 1.874384e-04 -0.162306590
## 59 4.879726 1.018113e-02 -1.196202689
## 60 4.899077 2.249739e-03 -0.587854989
## 61 4.870114 1.400524e-02 -1.402981909
## 62 4.904194 3.768392e-04 0.240592564
## 63 4.903908 5.261382e-04 0.271929772
## 64 4.819876 4.687834e-02 2.181827236
## 65 4.899077 2.249739e-03 -0.587854989
## 66 4.893045 2.888492e-03 -0.827251716
## 67 4.876193 4.052940e-02 1.276155547
## 68 4.901375 2.130829e-03 -0.465166678
## 69 4.900072 2.074290e-03 -0.538200007
## 70 4.893644 2.746495e-03 -0.806661880
## 71 4.855070 2.010496e-02 -1.675562463
## 72 4.904693 1.939387e-04 -0.172598235
## 73 4.855017 1.829628e-02 1.676430592
## 74 4.901341 2.149892e-03 -0.467242751
## 75 4.893260 2.837624e-03 0.819935167
## 76 4.904101 4.491639e-04 0.251251850
## 77 4.853667 2.053161e-02 1.698706389
## 78 4.884908 8.116206e-03 1.068029769
## 79 4.904897 7.741876e-05 -0.135433225
## 80 4.902639 9.463096e-04 -0.381259589
## 81 4.904152 2.543452e-04 0.245478742
## 82 4.904806 1.218734e-03 -0.153165404
## 83 4.902961 5.369254e-04 0.356663856
## 84 4.866129 1.426303e-02 1.480164962
## 85 4.904754 1.874384e-04 -0.162306590
## 86 4.885435 7.957044e-03 1.054107431
## 87 4.905151 2.893749e-05 0.063568128
## 88 4.825276 3.174557e-02 2.112264829
## 89 4.904442 1.853526e-04 -0.209556635
## 90 4.904742 1.922448e-04 -0.164374382
## 91 4.897178 4.453937e-03 0.672521687
## 92 4.902261 7.030898e-04 0.408138446
## 93 4.880253 9.971030e-03 -1.183795936
## 94 4.890710 8.029235e-03 0.902965863
## 95 4.901197 3.763354e-03 -0.475846028
## 96 4.891531 5.008164e-03 -0.877088548
## 97 4.864786 2.231340e-02 -1.505279580
## 98 4.880180 5.932118e-03 -1.185514863
## 99 4.905174 2.665668e-05 -0.052028020
## 100 4.858308 1.710282e-02 -1.620831987
## 101 4.834741 2.563232e-02 1.984257737
## 102 4.871414 1.234172e-02 1.376867262
## 103 4.884512 8.274103e-03 1.078368730
## 104 4.900072 2.074290e-03 -0.538200007
## 105 4.894442 5.966763e-03 0.778401443
## 106 4.813060 2.168204e-02 -2.266481255
## 107 4.898043 2.628135e-03 -0.635371930
## 108 4.895349 3.974537e-03 -0.744993181
## 109 4.875781 1.175148e-02 1.285147949
## 110 4.903599 3.854926e-04 -0.302210897
## 111 4.904027 6.625438e-04 0.259383029
## 112 4.904888 1.227157e-04 0.137294864
## 113 4.811529 7.605411e-02 2.285052621
## 114 4.887082 4.300207e-03 1.009361659
## 115 4.895294 3.996633e-03 -0.747061113
## 116 4.893346 2.817046e-03 -0.816956798
## 117 4.889096 6.487255e-03 -0.951786355
## 118 4.904024 6.638336e-04 -0.259635386
## 119 4.862490 3.978837e-02 1.547237493
## 120 4.898517 1.591419e-03 0.614036807
## 121 4.904693 1.939387e-04 -0.172598235
## 122 4.898356 1.629570e-03 -0.621353356
## 123 4.895349 3.974537e-03 -0.744993181
## 124 4.843427 1.458352e-02 -1.858802502
## 125 4.901341 2.149892e-03 -0.467242751
## 126 4.892049 4.819157e-03 0.860378763
## 127 4.877336 1.018587e-02 1.250844068
## 128 4.895349 3.974537e-03 -0.744993181
## 129 4.855017 1.829628e-02 1.676430592
## 130 4.905191 1.279432e-05 0.042404838
## 131 4.905222 5.648698e-08 -0.003658274
## 132 4.899857 1.273496e-03 -0.549288929
## 133 4.905187 1.407252e-05 0.044472630
## 134 4.901733 1.277936e-03 0.443056056
## 135 4.901493 1.492713e-03 0.458031070
## 136 4.896922 1.969670e-03 -0.683122864
## 137 4.899576 2.259209e-03 0.563488472
## 138 4.904792 2.383611e-04 0.155579346
## 139 4.902471 1.108535e-03 -0.393444786
## 140 4.902257 1.186770e-03 0.408404057
## 141 4.901119 1.502601e-03 -0.480425381
# Fit a model with an interaction term: cond:duration allows the slope of
# duration to differ between the "new" and "used" condition groups
lm(totalPr ~ cond + duration + cond:duration, data=mario_kart)
##
## Call:
## lm(formula = totalPr ~ cond + duration + cond:duration, data = mario_kart)
##
## Coefficients:
## (Intercept) condused duration
## 58.268 -17.122 -1.966
## condused:duration
## 2.325
# interaction plot
ggplot(mario_kart, aes(x=duration, y=totalPr, color=cond)) +
geom_point() +
geom_smooth(method="lm", se=FALSE)
# Base single-slope scatterplot with a fitted regression line.
# se = FALSE suppresses the confidence band; the original used se = 0,
# which works only via implicit numeric -> logical coercion.
slr <- ggplot(mario_kart, aes(y = totalPr, x = duration)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# model with one slope
lm(totalPr ~ duration, data=mario_kart)
##
## Call:
## lm(formula = totalPr ~ duration, data = mario_kart)
##
## Coefficients:
## (Intercept) duration
## 52.374 -1.317
# plot with two slopes
slr + aes(color=cond)
Chapter 3 - Multiple Regression
Adding a numerical explanatory variable - regressions with 2+ numerical variables:
Conditional interpretation of coefficients:
Adding a third (categorical) variable:
Higher dimensions:
Example code includes:
# Fit the model using duration and startPr
(mod <- lm(totalPr ~ duration + startPr, data=mario_kart))
##
## Call:
## lm(formula = totalPr ~ duration + startPr, data = mario_kart)
##
## Coefficients:
## (Intercept) duration startPr
## 51.030 -1.508 0.233
# One method for visualizing a multiple linear regression model is to create a heatmap of the fitted values in the plane defined by the two explanatory variables
# This heatmap will illustrate how the model output changes over different combinations of the explanatory variables
# This is a multistep process
# First, create a grid of the possible pairs of values of the explanatory variables. The grid should be over the actual range of the data present in each variable. We've done this for you and stored the result as a data frame called grid
# Use augment() with the newdata argument to find the y-hat corresponding to the values in grid
# Add these to the data_space plot by using the fill aesthetic and geom_tile()
# add predictions to grid
grid <- expand.grid(duration=1:10, startPr=seq(0.01, 69.95, by=0.01))
price_hats <- broom::augment(mod, newdata=grid)
# tile the plane
data_space <- mario_kart %>% filter(totalPr <= 75) %>%
ggplot(aes(x=duration, y=startPr)) +
geom_point(aes(col=totalPr))
data_space +
geom_tile(data = price_hats, aes(fill=.fitted), alpha=0.5)
# An alternative way to visualize a multiple regression model with two numeric explanatory variables is as a plane in three dimensions. This is possible in R using the plotly package
# We have created three objects that you will need
# x: a vector of unique values of duration
# y: a vector of unique values of startPr
# plane: a matrix of the fitted values across all combinations of x and y
# draw the 3D scatterplot
p <- plotly::plot_ly(data = mario_kart, z = ~totalPr, x = ~duration, y = ~startPr, opacity = 0.6) %>%
plotly::add_markers()
# draw the plane
x <- c(1, 1.13, 1.261, 1.391, 1.522, 1.652, 1.783, 1.913, 2.043, 2.174, 2.304, 2.435, 2.565, 2.696, 2.826, 2.957, 3.087, 3.217, 3.348, 3.478, 3.609, 3.739, 3.87, 4, 4.13, 4.261, 4.391, 4.522, 4.652, 4.783, 4.913, 5.043, 5.174, 5.304, 5.435, 5.565, 5.696, 5.826, 5.957, 6.087, 6.217, 6.348, 6.478, 6.609, 6.739, 6.87, 7, 7.13, 7.261, 7.391, 7.522, 7.652, 7.783, 7.913, 8.043, 8.174, 8.304, 8.435, 8.565, 8.696, 8.826, 8.957, 9.087, 9.217, 9.348, 9.478, 9.609, 9.739, 9.87, 10)
y <- c(0.01, 1.024, 2.037, 3.051, 4.064, 5.078, 6.092, 7.105, 8.119, 9.133, 10.146, 11.16, 12.173, 13.187, 14.201, 15.214, 16.228, 17.242, 18.255, 19.269, 20.282, 21.296, 22.31, 23.323, 24.337, 25.351, 26.364, 27.378, 28.391, 29.405, 30.419, 31.432, 32.446, 33.46, 34.473, 35.487, 36.5, 37.514, 38.528, 39.541, 40.555, 41.569, 42.582, 43.596, 44.609, 45.623, 46.637, 47.65, 48.664, 49.678, 50.691, 51.705, 52.718, 53.732, 54.746, 55.759, 56.773, 57.787, 58.8, 59.814, 60.827, 61.841, 62.855, 63.868, 64.882, 65.896, 66.909, 67.923, 68.936, 69.95)
# Build the grid of all (duration, startPr) pairs; expand.grid() varies the
# first factor (duration) fastest, so duration cycles within each startPr value
grid <- expand.grid(duration=x, startPr=y)
predPr <- broom::augment(mod, newdata=grid)
# Reshape the fitted values into a matrix for plotly's surface; byrow=FALSE
# (column-major fill) matches expand.grid()'s ordering, so rows index duration
# and columns index startPr.
# NOTE(review): assumes length(x) == length(y) == 70 -- confirm if the
# x/y vectors above are regenerated with a different resolution
plane <- matrix(data=predPr$.fitted, nrow=70, ncol=70, byrow=FALSE)
p <- p %>%
plotly::add_surface(x = ~x, y = ~y, z = ~plane, showscale = FALSE)
# Commented due to inability to use in html
# p
# draw the 3D scatterplot
# p <- plotly::plot_ly(data = mario_kart, z = ~totalPr, x = ~duration, y = ~startPr, opacity = 0.6) %>%
# plotly::add_markers(color = ~cond)
# draw two planes
# p %>%
# add_surface(x = ~x, y = ~y, z = ~plane0, showscale = FALSE) %>%
# add_surface(x = ~x, y = ~y, z = ~plane1, showscale = FALSE)
Chapter 4 - Logistic Regression
What is logistic regression?
Visualizing logistic regression:
Three scales approach to visualization:
Using a logistic model - the objective is to gain a better understanding of the underlying process:
Example code includes:
# To see this in action, we'll fit a linear regression model to data about 55 students who applied to medical school
# We want to understand how their undergraduate GPA relates to the probability they will be accepted by a particular school (Acceptance)
# The medical school acceptance data is loaded in your workspace as MedGPA
# scatterplot with jitter
# tmpSAT <- readr::read_csv("./RInputFiles/SAT.csv")
# The MedGPA dataset is reconstructed here by hand (Acceptance: 0/1 outcome,
# GPA: undergraduate grade point average), since the DataCamp workspace
# object is not available locally
tmpAccept <- c(0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0)
tmpGPA <- c(3.62, 3.84, 3.23, 3.69, 3.38, 3.72, 3.89, 3.34, 3.71, 3.89, 3.97, 3.49, 3.77, 3.61, 3.3, 3.54, 3.65, 3.54, 3.25, 3.89, 3.71, 3.77, 3.91, 3.88, 3.68, 3.56, 3.44, 3.58, 3.4, 3.82, 3.62, 3.09, 3.89, 3.7, 3.24, 3.86, 3.54, 3.4, 3.87, 3.14, 3.37, 3.38, 3.62, 3.94, 3.37, 3.36, 3.97, 3.04, 3.29, 3.67, 2.72, 3.56, 3.48, 2.8, 3.44)
MedGPA <- data.frame(Acceptance=tmpAccept, GPA=tmpGPA)
str(MedGPA)
## 'data.frame': 55 obs. of 2 variables:
## $ Acceptance: num 0 1 1 1 1 1 1 0 1 1 ...
## $ GPA : num 3.62 3.84 3.23 3.69 3.38 3.72 3.89 3.34 3.71 3.89 ...
data_space <- ggplot(MedGPA, aes(x=GPA, y=Acceptance)) +
geom_jitter(width = 0, height = 0.05, alpha = 0.5)
# linear regression line
data_space +
geom_smooth(method="lm", se=FALSE)
# filter
MedGPA_middle <- MedGPA %>%
filter(GPA >= 3.375, GPA <= 3.77)
# scatterplot with jitter
data_space <- ggplot(MedGPA_middle, aes(x=GPA, y=Acceptance)) +
geom_jitter(width = 0, height = 0.05, alpha = 0.5)
# linear regression line
data_space +
geom_smooth(method="lm", se=FALSE)
# fit model
(mod <- glm(Acceptance ~ GPA, data = MedGPA, family = binomial))
##
## Call: glm(formula = Acceptance ~ GPA, family = binomial, data = MedGPA)
##
## Coefficients:
## (Intercept) GPA
## -19.207 5.454
##
## Degrees of Freedom: 54 Total (i.e. Null); 53 Residual
## Null Deviance: 75.79
## Residual Deviance: 56.84 AIC: 60.84
# scatterplot with jitter
data_space <- ggplot(MedGPA, aes(x=GPA, y=Acceptance)) +
geom_jitter(width=0, height=0.05, alpha = .5)
# add logistic curve: fit a glm with a binomial family inside geom_smooth.
# Pass the extra glm argument via a plain list -- the original
# structure(list(...), .Names = "family") is a deparse artifact and is
# exactly equivalent to list(family = "binomial")
data_space +
geom_smooth(method="glm", se=FALSE, method.args=list(family="binomial"))
# We have created a data.frame called MedGPA_binned that aggregates the original data into separate bins for each 0.25 of GPA. It also contains the fitted values from the logistic regression model
# Bin GPA to the nearest 0.25: multiply by 4, round to the nearest integer,
# divide by 4 (so e.g. 3.62 -> 3.5, 3.84 -> 3.75)
MedGPA$bin <- round(MedGPA$GPA*4, 0) / 4
str(MedGPA)
## 'data.frame': 55 obs. of 3 variables:
## $ Acceptance: num 0 1 1 1 1 1 1 0 1 1 ...
## $ GPA : num 3.62 3.84 3.23 3.69 3.38 3.72 3.89 3.34 3.71 3.89 ...
## $ bin : num 3.5 3.75 3.25 3.75 3.5 3.75 4 3.25 3.75 4 ...
# Per-bin summaries: mean GPA, empirical acceptance rate (mean of the 0/1
# outcome), and observation count
MedGPA_binned <- MedGPA %>%
group_by(bin) %>%
summarize(mean_GPA=mean(GPA), acceptance_rate=mean(Acceptance), ct=n())
MedGPA_binned
## # A tibble: 6 x 4
## bin mean_GPA acceptance_rate ct
## <dbl> <dbl> <dbl> <int>
## 1 2.75 2.76 0 2
## 2 3.00 3.06 0 2
## 3 3.25 3.29 0.200 10
## 4 3.50 3.51 0.556 18
## 5 3.75 3.75 0.643 14
## 6 4.00 3.91 1.00 9
# binned points and line
data_space <- ggplot(MedGPA_binned, aes(x=mean_GPA, y=acceptance_rate)) +
geom_point() +
geom_line()
# augmented model
MedGPA_plus <- broom::augment(mod, type.predict="response")
# logistic model on probability scale
data_space +
geom_line(data = MedGPA_plus, aes(x=GPA, y=.fitted), color = "red")
# The MedGPA_binned data frame contains the data for each GPA bin, while the MedGPA_plus data frame records the original observations after being augment()-ed by mod
# compute odds for bins
MedGPA_binned <- MedGPA_binned %>%
mutate(odds = acceptance_rate / (1 - acceptance_rate))
# plot binned odds
data_space <- ggplot(MedGPA_binned, aes(x=mean_GPA, y=odds)) +
geom_point() +
geom_line()
# compute odds for observations
MedGPA_plus <- MedGPA_plus %>%
mutate(odds_hat = .fitted / (1 - .fitted))
# logistic model on odds scale
data_space +
geom_line(data=MedGPA_plus, aes(x=GPA, y=odds_hat), color = "red")
# compute log odds for bins
MedGPA_binned <- MedGPA_binned %>%
mutate(log_odds = log(acceptance_rate / (1 - acceptance_rate)))
# plot binned log odds
data_space <- ggplot(MedGPA_binned, aes(x=mean_GPA, y=log_odds)) +
geom_point() +
geom_line()
# compute log odds for observations
MedGPA_plus <- MedGPA_plus %>%
mutate(log_odds_hat = log(.fitted / (1 - .fitted)))
# logistic model on log odds scale
data_space +
geom_line(data=MedGPA_plus, aes(x=GPA, y=log_odds_hat), color = "red")
# create new data frame
new_data <- data.frame(GPA = 3.51)
# make predictions
broom::augment(mod, newdata=new_data, type.predict="response")
## GPA .fitted .se.fit
## 1 3.51 0.4844099 0.08343193
# data frame with binary predictions
tidy_mod <- broom::augment(mod, type.predict="response") %>%
mutate(Acceptance_hat = round(.fitted))
# confusion matrix
tidy_mod %>%
select(Acceptance, Acceptance_hat) %>%
table()
## Acceptance_hat
## Acceptance 0 1
## 0 16 9
## 1 6 24
Chapter 5 - Case Study: Italian Restaurants in NYC
Italian restaurants in NYC - factors that contribute to the price of a meal:
Incorporating another variable:
Higher dimensions - adding the décor dimension to the existing Zagat analysis:
Wrap up:
Example code includes:
nyc <- readr::read_csv("./RInputFiles/nyc.csv")
## Parsed with column specification:
## cols(
## Case = col_integer(),
## Restaurant = col_character(),
## Price = col_integer(),
## Food = col_integer(),
## Decor = col_integer(),
## Service = col_integer(),
## East = col_integer()
## )
str(nyc, give.attr=FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 168 obs. of 7 variables:
## $ Case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Restaurant: chr "Daniella Ristorante" "Tello's Ristorante" "Biricchino" "Bottino" ...
## $ Price : int 43 32 34 41 54 52 34 34 39 44 ...
## $ Food : int 22 20 21 20 24 22 22 20 22 21 ...
## $ Decor : int 18 19 13 20 19 22 16 18 19 17 ...
## $ Service : int 20 19 18 17 21 21 21 21 22 19 ...
## $ East : int 0 0 0 0 0 0 0 1 1 1 ...
# Price by Food plot
ggplot(nyc, aes(x=Food, y=Price)) +
geom_point()
# Price by Food model
lm(Price ~ Food, data=nyc)
##
## Call:
## lm(formula = Price ~ Food, data = nyc)
##
## Coefficients:
## (Intercept) Food
## -17.832 2.939
# fit model
lm(Price ~ Food + Service, data=nyc)
##
## Call:
## lm(formula = Price ~ Food + Service, data = nyc)
##
## Coefficients:
## (Intercept) Food Service
## -21.159 1.495 1.704
# draw 3D scatterplot
# p <- plot_ly(data = nyc, z = ~Price, x = ~Food, y = ~Service, opacity = 0.6) %>%
# add_markers()
# draw a plane
# p %>%
# add_surface(x = ~x, y = ~y, z = ~plane, showscale = FALSE)
# Price by Food and Service and East
lm(Price ~ Food + Service + East, data=nyc)
##
## Call:
## lm(formula = Price ~ Food + Service + East, data = nyc)
##
## Coefficients:
## (Intercept) Food Service East
## -20.8155 1.4863 1.6647 0.9649
# draw 3D scatterplot
# p <- plot_ly(data = nyc, z = ~Price, x = ~Food, y = ~Service, opacity = 0.6) %>%
# add_markers(color = ~factor(East))
# draw two planes
# p %>%
# add_surface(x = ~x, y = ~y, z = ~plane0, showscale = FALSE) %>%
# add_surface(x = ~x, y = ~y, z = ~plane1, showscale = FALSE)
Chapter 1 - Exploring and Visualizing Time Series in R
Introduction and overview:
Trends, seasonality, and cyclicity:
White noise - simply a time series of independently and identically distributed (iid) data:
Example code includes:
library(forecast)
# Read the data from Excel into R
mydata <- readxl::read_excel("./RInputFiles/exercise1.xlsx")
# Look at the first few lines of mydata
head(mydata)
## # A tibble: 6 x 4
## X__1 Sales AdBudget GDP
## <chr> <dbl> <dbl> <dbl>
## 1 Mar-81 1020 659 252
## 2 Jun-81 889 589 291
## 3 Sep-81 795 512 291
## 4 Dec-81 1004 614 292
## 5 Mar-82 1058 647 279
## 6 Jun-82 944 602 254
# Create a ts object called myts
myts <- ts(mydata[, -1], start = c(1981, 1), frequency = 4)
# Plot the data with facetting
autoplot(myts, facets = TRUE)
# Plot the data without facetting
autoplot(myts, facets = FALSE)
# Plot the three series
data(gold, package="forecast")
data(woolyrnq, package="forecast")
data(gas, package="forecast")
str(gold)
## Time-Series [1:1108] from 1 to 1108: 306 300 303 297 304 ...
str(woolyrnq)
## Time-Series [1:119] from 1965 to 1994: 6172 6709 6633 6660 6786 ...
str(gas)
## Time-Series [1:476] from 1956 to 1996: 1709 1646 1794 1878 2173 ...
autoplot(gold)
autoplot(woolyrnq)
autoplot(gas)
# Find the outlier in the gold series
goldoutlier <- which.max(gold)
# Look at the seasonal frequencies of the three series
frequency(gold)
## [1] 1
frequency(woolyrnq)
## [1] 4
frequency(gas)
## [1] 12
# In this exercise, you will load the fpp2 package and use two of its datasets
# a10 contains monthly sales volumes for anti-diabetic drugs in Australia. In the plots, can you see which month has the highest sales volume each year? What is unusual about the results in March and April 2008?
# ausbeer which contains quarterly beer production for Australia. What is happening to the beer production in Quarter 4?
# Load the fpp2 package
# library(fpp2)
data(a10, package="fpp2")
data(ausbeer, package="fpp2")
str(a10)
## Time-Series [1:204] from 1992 to 2008: 3.53 3.18 3.25 3.61 3.57 ...
str(ausbeer)
## Time-Series [1:218] from 1956 to 2010: 284 213 227 308 262 228 236 320 272 233 ...
# Create plots of the a10 data
autoplot(a10)
forecast::ggseasonplot(a10)
# Produce a polar coordinate season plot for the a10 data
forecast::ggseasonplot(a10, polar = TRUE)
# Restrict the ausbeer data to start in 1992
beer <- window(ausbeer, start=1992)
# Make plots of the beer data
autoplot(beer)
forecast::ggsubseriesplot(beer)
# In this exercise, you will work with the pre-loaded oil data (available in the package fpp2), which contains the annual oil production in Saudi Arabia from 1965-2013 (measured in millions of tons).
# Create an autoplot of the oil data
data(oil, package="fpp2")
str(oil)
## Time-Series [1:49] from 1965 to 2013: 111 131 141 154 163 ...
autoplot(oil)
# Create a lag plot of the oil data
forecast::gglagplot(oil)
# Create an ACF plot of the oil data
library(forecast)
ggAcf(oil)
# You will investigate this phenomenon by plotting the annual sunspot series (which follows the solar cycle of approximately 10-11 years) in sunspot.year
# and the daily traffic to the Hyndsight blog (which follows a 7-day weekly pattern) in hyndsight. Both objects have been loaded into your workspace.
# Plot the annual sunspot numbers
data(sunspot.year)
str(sunspot.year)
## Time-Series [1:289] from 1700 to 1988: 5 11 16 23 36 58 29 20 10 8 ...
autoplot(sunspot.year)
ggAcf(sunspot.year)
# Plot the traffic on the Hyndsight blog
data(hyndsight, package="fpp2")
str(hyndsight)
## Time-Series [1:365] from 1.43 to 53.4: 1157 1118 1310 874 890 1437 1263 1187 1506 1448 ...
autoplot(hyndsight)
ggAcf(hyndsight)
# You can test this hypothesis by looking at the goog series, which contains the closing stock price for Google over 1000 trading days ending on February 13, 2017. This data has been loaded into your workspace.
# Plot the original series
data(goog, package="fpp2")
str(goog)
## Time-Series [1:1000] from 1 to 1000: 393 393 397 398 400 ...
autoplot(goog)
# Plot the differenced series
autoplot(diff(goog))
# ACF of the differenced series
ggAcf(diff(goog))
# Ljung-Box test of the differenced series
Box.test(diff(goog), lag = 10, type = "Ljung")
##
## Box-Ljung test
##
## data: diff(goog)
## X-squared = 13.123, df = 10, p-value = 0.2169
Chapter 2 - Benchmark methods and forecast accuracy
Forecasts and potential futures:
Fitted values and residuals:
Training and test sets help to validate the forecasting methodology:
To subset observations from 101 to 500 (use the generic subset(); the forecast package supplies the ts method, so subset.ts() should not be called directly):
train <- subset(x, start = 101, end = 500, ...)
To subset the first 500 observations:
train <- subset(x, end = 500, ...)
Time series cross-validation attempts to solve some of the problems related to test-train:
Example code includes:
# Use naive() to forecast the goog series
fcgoog <- naive(goog, h=20)
# Plot and summarize the forecasts
autoplot(fcgoog)
summary(fcgoog)
##
## Forecast method: Naive method
##
## Model Information:
## Call: naive(y = goog, h = 20)
##
## Residual sd: 8.7285
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.4212612 8.734286 5.829407 0.06253998 0.9741428 1
## ACF1
## Training set 0.03871446
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1001 813.67 802.4765 824.8634 796.5511 830.7889
## 1002 813.67 797.8401 829.4999 789.4602 837.8797
## 1003 813.67 794.2824 833.0576 784.0192 843.3208
## 1004 813.67 791.2831 836.0569 779.4322 847.9078
## 1005 813.67 788.6407 838.6993 775.3910 851.9490
## 1006 813.67 786.2518 841.0882 771.7374 855.6025
## 1007 813.67 784.0549 843.2850 768.3777 858.9623
## 1008 813.67 782.0102 845.3298 765.2505 862.0895
## 1009 813.67 780.0897 847.2503 762.3133 865.0266
## 1010 813.67 778.2732 849.0667 759.5353 867.8047
## 1011 813.67 776.5456 850.7944 756.8931 870.4469
## 1012 813.67 774.8948 852.4452 754.3684 872.9715
## 1013 813.67 773.3115 854.0285 751.9470 875.3930
## 1014 813.67 771.7880 855.5520 749.6170 877.7230
## 1015 813.67 770.3180 857.0220 747.3688 879.9711
## 1016 813.67 768.8962 858.4437 745.1944 882.1455
## 1017 813.67 767.5183 859.8217 743.0870 884.2530
## 1018 813.67 766.1802 861.1597 741.0407 886.2993
## 1019 813.67 764.8789 862.4610 739.0505 888.2895
## 1020 813.67 763.6114 863.7286 737.1120 890.2280
# Use snaive() to forecast the ausbeer series
fcbeer <- snaive(ausbeer, h=16)
# Plot and summarize the forecasts
autoplot(fcbeer)
summary(fcbeer)
##
## Forecast method: Seasonal naive method
##
## Model Information:
## Call: snaive(y = ausbeer, h = 16)
##
## Residual sd: 19.1207
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3.098131 19.32591 15.50935 0.838741 3.69567 1 0.01093868
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2010 Q3 419 394.2329 443.7671 381.1219 456.8781
## 2010 Q4 488 463.2329 512.7671 450.1219 525.8781
## 2011 Q1 414 389.2329 438.7671 376.1219 451.8781
## 2011 Q2 374 349.2329 398.7671 336.1219 411.8781
## 2011 Q3 419 383.9740 454.0260 365.4323 472.5677
## 2011 Q4 488 452.9740 523.0260 434.4323 541.5677
## 2012 Q1 414 378.9740 449.0260 360.4323 467.5677
## 2012 Q2 374 338.9740 409.0260 320.4323 427.5677
## 2012 Q3 419 376.1020 461.8980 353.3932 484.6068
## 2012 Q4 488 445.1020 530.8980 422.3932 553.6068
## 2013 Q1 414 371.1020 456.8980 348.3932 479.6068
## 2013 Q2 374 331.1020 416.8980 308.3932 439.6068
## 2013 Q3 419 369.4657 468.5343 343.2438 494.7562
## 2013 Q4 488 438.4657 537.5343 412.2438 563.7562
## 2014 Q1 414 364.4657 463.5343 338.2438 489.7562
## 2014 Q2 374 324.4657 423.5343 298.2438 449.7562
# Check the residuals from the naive forecasts applied to the goog series
goog %>% naive() %>% checkresiduals()
##
## Ljung-Box test
##
## data: Residuals from Naive method
## Q* = 13.123, df = 10, p-value = 0.2169
##
## Model df: 0. Total lags used: 10
# Do they look like white noise (TRUE or FALSE)
googwn <- TRUE
# Check the residuals from the seasonal naive forecasts applied to the ausbeer series
ausbeer %>% snaive() %>% checkresiduals()
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 60.535, df = 8, p-value = 3.661e-10
##
## Model df: 0. Total lags used: 8
# Do they look like white noise (TRUE or FALSE)
beerwn <- FALSE
# The pre-loaded time series gold comprises daily gold prices for 1108 days. Here, you'll use the first 1000 days as a training set, and compute forecasts for the remaining 108 days
# Create the training data as train
train <- subset(gold, end = 1000)
# Compute naive forecasts and save to naive_fc
naive_fc <- naive(train, h = 108)
# Compute mean forecasts and save to mean_fc
mean_fc <- meanf(train, h = 108)
# Use accuracy() to compute RMSE statistics
accuracy(naive_fc, gold)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.09161392 6.33977 3.158386 0.01662141 0.794523 1.000000
## Test set -6.53834951 15.84236 13.638350 -1.74622688 3.428789 4.318139
## ACF1 Theil's U
## Training set -0.3098928 NA
## Test set 0.9793153 5.335899
# Evaluate the mean forecast on the same gold test set, for comparison
# with the naive forecast evaluated just above
accuracy(mean_fc, gold)
## ME RMSE MAE MPE MAPE MASE
## Training set -4.239671e-15 59.17809 53.63397 -2.390227 14.230224 16.981449
## Test set 1.319363e+01 19.55255 15.66875 3.138577 3.783133 4.960998
## ACF1 Theil's U
## Training set 0.9907254 NA
## Test set 0.9793153 6.123788
# Assign one of the two forecasts as bestforecasts
# bestforecasts <- naive_fc
# Here, you will use the Melbourne quarterly visitor numbers (vn[, "Melbourne"]) to create three different training sets, omitting the last 1, 2 and 3 years, respectively
# Inspect the pre-loaded vn data in your console before beginning the exercise
# This will help you determine the correct value to use for the keyword h (which specifies the number of values you want to forecast) in your forecasting methods
# vn is pre-loaded on DataCamp only, so the two quarterly series are
# entered by hand here to reproduce the exercise locally
melData <- c(4.865, 4.113, 4.422, 5.171, 5.55, 4.009, 3.986, 3.839, 5.8, 4.229, 4.157, 4.627, 5.691, 4.601, 4.742, 5.733, 5.397, 3.884, 4.996, 5.304, 5.222, 4.765, 4.146, 4.717, 4.88, 4.868, 4.182, 4.214, 5.438, 3.87, 4.394, 4.404, 5.716, 5.291, 4.19, 4.712, 4.709, 4.489, 4.698, 5.193, 5.216, 4.215, 5.042, 5.089, 4.688, 4.393, 4.626, 4.88, 4.844, 4.437, 4.833, 4.622, 5.164, 4.504, 4.976, 4.508, 4.759, 4.835, 5.009, 5.693, 5.224, 4.82, 4.688, 4.918, 5.936, 5.44, 5.134, 5.993, 6.654, 5.342, 5.471, 5.812)
sydData <- c(7.319, 6.13, 6.284, 6.384, 6.602, 5.674, 5.715, 6.564, 6.602, 5.398, 7.172, 8.474, 7.012, 6.388, 6.073, 6.196, 5.633, 5.779, 5.869, 6.002, 6.202, 5.321, 5.161, 5.737, 6.168, 5.709, 5.057, 5.362, 5.902, 4.496, 5.093, 5.253, 6.832, 5.67, 5.008, 5.773, 6.529, 4.911, 4.784, 5.844, 6.252, 5.034, 5.263, 4.714, 5.362, 4.769, 4.125, 5.263, 6, 4.283, 5.256, 5.357, 6.194, 5.102, 5.596, 5.066, 6.684, 4.697, 5.366, 5.075, 5.499, 4.867, 5.71, 6.198, 6.416, 5.284, 5.483, 6.234, 6.938, 6.268, 5.562, 6.016)
vnFrame <- data.frame(Melbourne=melData, Sydney=sydData)
# Quarterly series (frequency 4) starting 1998 Q1; 72 obs -> ends 2015 Q4
vn <- ts(vnFrame, start=c(1998, 1), frequency=4)
str(vn)
## Time-Series [1:72, 1:2] from 1998 to 2016: 4.87 4.11 4.42 5.17 5.55 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "Melbourne" "Sydney"
# Create three training series omitting the last 1, 2, and 3 years
train1 <- window(vn[, "Melbourne"], end = c(2014, 4))
train2 <- window(vn[, "Melbourne"], end = c(2013, 4))
train3 <- window(vn[, "Melbourne"], end = c(2012, 4))
# Produce forecasts using snaive()
# NOTE(review): all three fits use h = 4, so fc2/fc3 are evaluated only
# on the single year following their training window, not the full
# omitted span -- confirm this matches the intended exercise
fc1 <- snaive(train1, h = 4)
fc2 <- snaive(train2, h = 4)
fc3 <- snaive(train3, h = 4)
# Use accuracy() to compare the MAPE of each series
accuracy(fc1, vn[, "Melbourne"])["Test set", "MAPE"]
## [1] 5.474755
accuracy(fc2, vn[, "Melbourne"])["Test set", "MAPE"]
## [1] 12.50411
accuracy(fc3, vn[, "Melbourne"])["Test set", "MAPE"]
## [1] 7.954534
# Test MAPE is not monotone in training length: the set ending 2013
# (fc2) scores worst of the three here
# Compute cross-validated errors for up to 8 steps ahead.
# tsCV() returns one error per observation of the input series, so size
# the error matrix from the series itself instead of hard-coding 1000
# rows (which silently breaks for any other series length).
e <- matrix(NA_real_, nrow = length(goog), ncol = 8)
for (h in 1:8) {
  e[, h] <- tsCV(goog, forecastfunction = naive, h = h)
}
# Compute the MSE values and remove missing values (tsCV() leaves NAs
# where no h-step-ahead observation exists to compare against)
mse <- colMeans(e^2, na.rm = TRUE)
# Plot the MSE values against the forecast horizon
data.frame(h = 1:8, MSE = mse) %>%
  ggplot(aes(x = h, y = MSE)) + geom_point()
Chapter 3 - Exponential smoothing
Exponentially weighted forecasts (simple exponential smoothing):
Exponential smoothing methods with trend:
Exponential smoothing methods with trend and seasonality (commonly known as the Holt-Winters method):
State space models for exponential smoothing:
Example code includes:
# --- Simple exponential smoothing (ses) on the Boston marathon data ---
# You will also use summary() and fitted(), along with autolayer() for the first time, which is like autoplot() but it adds a "layer" to a plot rather than creating a new plot.
# Here, you will apply these functions to marathon, the annual winning times in the Boston marathon from 1897-2016. The data are available in your workspace.
# Use ses() to forecast the next 10 years of winning times
data(marathon, package="fpp2")
str(marathon)
## Time-Series [1:120] from 1897 to 2016: 175 162 175 160 149 ...
fc <- ses(marathon, h = 10)
# Use summary() to see the model parameters
summary(fc)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = marathon, h = 10)
##
## Smoothing parameters:
## alpha = 0.3457
##
## Initial states:
## l = 167.1765
##
## sigma: 5.4728
##
## AIC AICc BIC
## 988.4474 988.6543 996.8099
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.8875951 5.472771 3.826287 -0.7098466 2.637645 0.8925669
## ACF1
## Training set -0.01207536
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2017 130.3562 123.3425 137.3698 119.6297 141.0826
## 2018 130.3562 122.9353 137.7771 119.0069 141.7054
## 2019 130.3562 122.5492 138.1631 118.4165 142.2958
## 2020 130.3562 122.1814 138.5309 117.8539 142.8584
## 2021 130.3562 121.8294 138.8829 117.3156 143.3967
## 2022 130.3562 121.4914 139.2209 116.7987 143.9136
## 2023 130.3562 121.1658 139.5465 116.3008 144.4116
## 2024 130.3562 120.8514 139.8610 115.8199 144.8925
## 2025 130.3562 120.5470 140.1653 115.3544 145.3580
## 2026 130.3562 120.2518 140.4605 114.9029 145.8094
# Use autoplot() to plot the forecasts
autoplot(fc)
# Add the one-step forecasts for the training data to the plot
autoplot(fc) + autolayer(fitted(fc))
# --- SES vs naive on a held-out test set of the last 20 observations ---
# Create a training set using subset.ts()
train <- subset(marathon, end = length(marathon) - 20)
# Compute SES and naive forecasts, save to fcses and fcnaive
fcses <- ses(train, h = 20)
fcnaive <- naive(train, h = 20)
# Calculate forecast accuracy measures
accuracy(fcses, marathon)
## ME RMSE MAE MPE MAPE MASE
## Training set -1.085512 5.863790 4.155943 -0.8606360 2.827999 0.8990895
## Test set 0.457428 2.493965 1.894228 0.3171688 1.463856 0.4097941
## ACF1 Theil's U
## Training set -0.01587645 NA
## Test set -0.12556096 0.6870722
accuracy(fcnaive, marathon)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.4638047 6.904742 4.622391 -0.4086317 3.123559 1.0000000
## Test set 0.2266667 2.462113 1.846667 0.1388780 1.429608 0.3995047
## ACF1 Theil's U
## Training set -0.3589323 NA
## Test set 0.2266667 is not shown here; see transcript above
# Save the best forecasts as fcbest
# (naive has the lower test RMSE/MAE in the transcripts above)
# fcbest <- fcnaive
# --- Holt's linear trend method on Australian visitor numbers ---
# Here, you will apply it to the austa series, which contains annual counts of international visitors to Australia from 1980-2015 (in millions). The data has been pre-loaded into your workspace.
# Produce 10 year forecasts of austa using holt()
data(austa, package="fpp2")
str(austa)
## Time-Series [1:36] from 1980 to 2015: 0.83 0.86 0.877 0.867 0.932 ...
fcholt <- holt(austa, h=10)
# Look at fitted model using summary()
summary(fcholt)
##
## Forecast method: Holt's method
##
## Model Information:
## Holt's method
##
## Call:
## holt(y = austa, h = 10)
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 1e-04
##
## Initial states:
## l = 0.5684
## b = 0.1755
##
## sigma: 0.1839
##
## AIC AICc BIC
## 17.08684 19.08684 25.00443
##
## Error measures:
## ME RMSE MAE MPE MAPE
## Training set -0.0006980015 0.1839059 0.1628927 -1.231661 6.322328
## MASE ACF1
## Training set 0.7994647 0.234277
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2016 7.034379 6.798694 7.270064 6.673930 7.394828
## 2017 7.209838 6.876529 7.543147 6.700086 7.719590
## 2018 7.385297 6.977065 7.793529 6.760960 8.009633
## 2019 7.560756 7.089350 8.032161 6.839804 8.281707
## 2020 7.736214 7.209144 8.263285 6.930129 8.542299
## 2021 7.911673 7.334269 8.489077 7.028610 8.794736
## 2022 8.087132 7.463435 8.710829 7.133269 9.040994
## 2023 8.262591 7.595798 8.929383 7.242820 9.282362
## 2024 8.438049 7.730775 9.145324 7.356366 9.519733
## 2025 8.613508 7.867939 9.359078 7.473258 9.753758
# Plot the forecasts
autoplot(fcholt)
# Check that the residuals look like white noise
# (Ljung-Box p = 0.49 below, so no evidence against white noise)
checkresiduals(fcholt)
##
## Ljung-Box test
##
## data: Residuals from Holt's method
## Q* = 5.4561, df = 6, p-value = 0.4868
##
## Model df: 4. Total lags used: 10
# --- Holt-Winters with multiplicative seasonality on drug sales ---
# Here, you will apply hw() to a10, the monthly sales of anti-diabetic drugs in Australia from 1991 to 2008. The data are available in your workspace.
# Plot the data
data(a10, package="fpp2")
str(a10)
## Time-Series [1:204] from 1992 to 2008: 3.53 3.18 3.25 3.61 3.57 ...
autoplot(a10)
# Produce 3 year forecasts
# NOTE(review): a10 is monthly, so h = 3 forecasts 3 months; 3 years
# would be h = 36 -- confirm against the original exercise
fc <- hw(a10, seasonal = "multiplicative", h = 3)
# Check if residuals look like white noise
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from Holt-Winters' multiplicative method
## Q* = 55.57, df = 8, p-value = 3.421e-09
##
## Model df: 16. Total lags used: 24
# The tiny Ljung-Box p-value above means the residuals are NOT white noise
whitenoise <- FALSE
# Plot forecasts
autoplot(fc)
# --- Additive Holt-Winters vs seasonal naive on daily pageviews ---
# Here, you will compare an additive Holt-Winters method and a seasonal naive() method for the hyndsight data, which contains the daily pageviews on the Hyndsight blog for one year starting April 30, 2014
# Create training data with subset()
train <- subset(hyndsight, end = length(hyndsight) - 28)
# Holt-Winters additive forecasts as fchw
fchw <- hw(train, seasonal = "additive", h = 28)
# Seasonal naive forecasts as fcsn
fcsn <- snaive(train, h=28)
# Find better forecasts with accuracy()
accuracy(fchw, hyndsight)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06894788 230.0922 164.2099 -2.6329101 13.848691 0.7455151
## Test set 41.12766744 180.6719 143.8484 0.9333033 9.254484 0.6530736
## ACF1 Theil's U
## Training set 0.2145322 NA
## Test set 0.2337612 0.4233897
accuracy(fcsn, hyndsight)
## ME RMSE MAE MPE MAPE MASE
## Training set 10.50 310.3282 220.2636 -2.1239387 18.01077 1.0000000
## Test set 0.25 202.7610 160.4643 -0.6888732 10.25880 0.7285101
## ACF1 Theil's U
## Training set 0.4255730 NA
## Test set 0.3089795 0.450266
# Plot the better forecasts (Holt-Winters: lower test RMSE above)
autoplot(fchw)
# --- Automatic model selection with ets() ---
# Fit ETS model to austa in fitaus
fitaus <- ets(austa)
# Check residuals
checkresiduals(fitaus)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,A,N)
## Q* = 5.4561, df = 6, p-value = 0.4868
##
## Model df: 4. Total lags used: 10
# Plot forecasts
autoplot(forecast(fitaus))
# Repeat for hyndsight data in fiths
fiths <- ets(hyndsight)
checkresiduals(fiths)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 65.856, df = 5, p-value = 7.444e-13
##
## Model df: 9. Total lags used: 14
autoplot(forecast(fiths))
# Which model(s) fails test? (TRUE or FALSE)
# austa: p = 0.49 (passes); hyndsight: p = 7e-13 (fails)
fitausfail <- FALSE
fithsfail <- TRUE
# --- Cross-validated comparison of ETS vs seasonal naive on cement ---
# Function to return ETS forecasts (wrapper so tsCV() can call it)
fets <- function(y, h) {
forecast(ets(y), h = h)
}
data(qcement, package="fpp2")
str(qcement)
## Time-Series [1:233] from 1956 to 2014: 0.465 0.532 0.561 0.57 0.529 0.604 0.603 0.582 0.554 0.62 ...
cement <- window(qcement, start=1994)
str(cement)
## Time-Series [1:81] from 1994 to 2014: 1.47 1.75 1.96 1.83 1.63 ...
# Apply tsCV() for both methods
e1 <- tsCV(cement, fets, h = 4)
e2 <- tsCV(cement, snaive, h = 4)
# Compute MSE of resulting errors (watch out for missing values)
mean(e1^2, na.rm=TRUE)
## [1] 0.04442133
mean(e2^2, na.rm=TRUE)
## [1] 0.02921384
# Copy the best forecast MSE (seasonal naive wins here)
bestmse <- mean(e2^2, na.rm=TRUE)
# --- A series where ETS fails: the Canadian lynx cycle ---
# Computing the ETS does not work well for all series
# Here, you will observe why it does not work well for the annual Canadian lynx population available in your workspace as lynx
# Plot the lynx series
data(lynx)
str(lynx)
## Time-Series [1:114] from 1821 to 1934: 269 321 585 871 1475 ...
autoplot(lynx)
# Use ets() to model the lynx series
fit <- ets(lynx)
# Use summary() to look at model and parameters
summary(fit)
## ETS(M,N,N)
##
## Call:
## ets(y = lynx)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 169.2223
##
## sigma: 0.9489
##
## AIC AICc BIC
## 2052.369 2052.587 2060.578
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 28.30726 1182.181 824.4878 -44.94341 94.83425 0.9923325
## ACF1
## Training set 0.3785704
# Plot 20-year forecasts of the lynx series
# (ETS(M,N,N) selected above has no trend or seasonal component, so the
# point forecast is flat and misses the population cycle)
fit %>% forecast(h=20) %>% autoplot()
Chapter 4 - Forecasting with ARIMA Models
Transformations for variance stabilization:
ARIMA models - AutoRegressive Integrated Moving Average:
Seasonal ARIMA models - extend ARIMA with additional seasonal differencing and seasonal lag terms:
Example code includes:
# --- Box-Cox transformations for variance stabilization ---
# Plot the series
autoplot(a10)
# Try four values of lambda in Box-Cox transformations
# (lambda = 0 is the log transform; larger lambda transforms less)
a10 %>% BoxCox(lambda = 0.0) %>% autoplot()
a10 %>% BoxCox(lambda = 0.1) %>% autoplot()
a10 %>% BoxCox(lambda = 0.2) %>% autoplot()
a10 %>% BoxCox(lambda = 0.3) %>% autoplot()
# Compare with BoxCox.lambda()
BoxCox.lambda(a10)
## [1] 0.1313326
# --- Differencing to obtain stationarity ---
# In this exercise, you will use the pre-loaded wmurders data, which contains the annual female murder rate in the US from 1950-2004
data(wmurders, package="fpp2")
# Plot the US female murder rate
autoplot(wmurders)
# Plot the differenced murder rate
autoplot(diff(wmurders))
# Plot the ACF of the differenced murder rate
ggAcf(diff(wmurders))
# In this exercise, you will use differencing and transformations simultaneously to make a time series look stationary. The data set here is h02, which contains 17 years of monthly corticosteroid drug sales in Australia
data(h02, package="fpp2")
str(h02)
## Time-Series [1:204] from 1992 to 2008: 0.43 0.401 0.432 0.493 0.502 ...
# Plot the data
autoplot(h02)
# Take logs and seasonal differences of h02
# (lag = 12 because the series is monthly)
difflogh02 <- diff(log(h02), lag = 12)
# Plot difflogh02
autoplot(difflogh02)
# Take another difference and plot
ddifflogh02 <- diff(difflogh02)
autoplot(ddifflogh02)
# Plot ACF of ddifflogh02
ggAcf(ddifflogh02)
# --- Automatic ARIMA modelling of austa ---
# Fit an automatic ARIMA model to the austa series
fit <- auto.arima(austa)
# Check that the residuals look like white noise
checkresiduals(fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1) with drift
## Q* = 3.2552, df = 8, p-value = 0.9173
##
## Model df: 2. Total lags used: 10
# p = 0.92 above: no evidence the residuals are not white noise
residualsok <- TRUE
# Summarize the model
summary(fit)
## Series: austa
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.3006 0.1735
## s.e. 0.1647 0.0390
##
## sigma^2 estimated as 0.03376: log likelihood=10.62
## AIC=-15.24 AICc=-14.46 BIC=-10.57
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0008313383 0.1759116 0.1520309 -1.069983 5.513269 0.7461559
## ACF1
## Training set -0.000571993
# Find the AICc value and the number of differences used
AICc <- round(fit$aicc, 2)
# d = 1: the selected model is ARIMA(0,1,1), i.e. one difference
d <- 1
# Plot forecasts of fit
fit %>% forecast(h = 10) %>% autoplot()
# --- How order and constant change the forecast shape ---
# Plot forecasts from an ARIMA(0,1,1) model with no drift
austa %>% Arima(order = c(0, 1, 1), include.constant = FALSE) %>% forecast() %>% autoplot()
# Plot forecasts from an ARIMA(2,1,3) model with drift
austa %>% Arima(order = c(2, 1, 3), include.constant = TRUE) %>% forecast() %>% autoplot()
# Plot forecasts from an ARIMA(0,0,1) model with a constant
austa %>% Arima(order = c(0, 0, 1), include.constant = TRUE) %>% forecast() %>% autoplot()
# Plot forecasts from an ARIMA(0,2,1) model with no constant
austa %>% Arima(order = c(0, 2, 1), include.constant = FALSE) %>% forecast() %>% autoplot()
# Set up one-argument-style wrapper functions so tsCV() can generate
# forecasts from each model class: both take a series x and horizon h
# and return a forecast object.
fets <- function(x, h) {
  forecast(ets(x), h = h)
}
farima <- function(x, h) {
  forecast(auto.arima(x), h = h)
}
# Compute CV errors for ETS as e1
e1 <- tsCV(austa, fets, h = 1)
# Compute CV errors for ARIMA as e2
e2 <- tsCV(austa, farima, h = 1)
# Find MSE of each model class
# (use the documented ^ operator rather than the ** parser alias)
mean(e1^2, na.rm = TRUE)
## [1] 0.05684574
mean(e2^2, na.rm = TRUE)
## [1] 0.04336277
# Plot 10-year forecasts using the best model class
# (ARIMA: lower cross-validated MSE above)
austa %>% farima(h = 10) %>% autoplot()
# --- Seasonal ARIMA for monthly corticosteroid sales ---
# Check that the logged h02 data have stable variance
h02 %>% log() %>% autoplot()
# Fit a seasonal ARIMA model to h02 with lambda = 0
# (lambda = 0 applies a log transform inside the model, so forecasts
# are back-transformed automatically)
fit <- auto.arima(h02, lambda=0)
# Summarize the fitted model
summary(fit)
## Series: h02
## ARIMA(2,1,3)(0,1,1)[12]
## Box Cox transformation: lambda= 0
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 sma1
## -1.0194 -0.8351 0.1717 0.2578 -0.4206 -0.6528
## s.e. 0.1648 0.1203 0.2079 0.1177 0.1060 0.0657
##
## sigma^2 estimated as 0.004203: log likelihood=250.8
## AIC=-487.6 AICc=-486.99 BIC=-464.83
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set -0.003823286 0.05006017 0.03588577 -0.643286 4.52991
## MASE ACF1
## Training set 0.5919957 -0.007519928
# Record the amount of lag-1 differencing and seasonal differencing used
# (read from the ARIMA(2,1,3)(0,1,1)[12] specification above)
d <- 1
D <- 1
# Plot 2-year forecasts (24 months)
fit %>% forecast(h=24) %>% autoplot()
# --- Stepwise vs exhaustive model search on euretail ---
# Find an ARIMA model for euretail
data(euretail, package="fpp2")
str(euretail)
## Time-Series [1:64] from 1996 to 2012: 89.1 89.5 89.9 90.1 89.2 ...
fit1 <- auto.arima(euretail)
# Don't use a stepwise search (slower but explores more models)
fit2 <- auto.arima(euretail, stepwise=FALSE)
# AICc of better model
AICc <- round(min(fit1$aicc, fit2$aicc), 2)
# Compute 2-year forecasts from better model (quarterly data, so h = 8)
fit2 %>% forecast(h=8) %>% autoplot()
# --- Seasonal ARIMA vs ETS on a train/test split of qcement ---
# In the final exercise for this chapter, you will compare seasonal ARIMA and ETS models applied to the quarterly cement production data qcement
# Because the series is very long, you can afford to use a training and test set rather than time series cross-validation. This is much faster
# Use 20 years of the qcement data beginning in 1988
train <- window(qcement, start = c(1988, 1), end = c(2007, 4))
# Fit an ARIMA and an ETS model to the training data
fit1 <- auto.arima(train)
fit2 <- ets(train)
# Check that both models have white noise residuals
checkresiduals(fit1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0)(2,1,1)[4] with drift
## Q* = 3.5497, df = 3, p-value = 0.3144
##
## Model df: 6. Total lags used: 9
checkresiduals(fit2)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 7.1565, df = 3, p-value = 0.06707
##
## Model df: 6. Total lags used: 9
# Produce forecasts for each model, covering exactly the held-out span
fc1 <- forecast(fit1, h = length(window(qcement, start=2008)))
fc2 <- forecast(fit2, h = length(window(qcement, start=2008)))
# Use accuracy() to find better model based on RMSE
accuracy(fc1, qcement)
## ME RMSE MAE MPE MAPE
## Training set -0.005847264 0.1005547 0.07954692 -0.6550065 4.347836
## Test set -0.158355693 0.1978197 0.16749405 -7.3004639 7.653446
## MASE ACF1 Theil's U
## Training set 0.5434705 -0.01315073 NA
## Test set 1.1443319 0.28848376 0.7226779
accuracy(fc2, qcement)
## ME RMSE MAE MPE MAPE
## Training set 0.01497655 0.1017825 0.07904747 0.5666991 4.332964
## Test set -0.13546124 0.1836477 0.15332875 -6.2632105 6.954592
## MASE ACF1 Theil's U
## Training set 0.5400583 -0.03949197 NA
## Test set 1.0475535 0.54271907 0.6810648
# ETS has the lower test RMSE (0.184 vs 0.198 above)
# bettermodel <- fc2
Chapter 5 - Advanced Methods
Dynamic Regression - could include factors like advertising or competition in a single model:
Dynamic Harmonic Regression - handling periodic seasonality with Fourier terms:
args(tslm)
TBATS models - combines many model components into a single model:
Wrap up:
Example code includes:
# --- Dynamic regression: sales on advertising with ARMA errors ---
# In this exercise, you will model sales data regressed against advertising expenditure, with an ARMA error to account for any serial correlation in the regression errors
# The data are available in your workspace as advert and comprise 24 months of sales and advertising expenditure for an automotive parts company
data(advert, package="fma")
str(advert)
## Time-Series [1:24, 1:2] from 1 to 24: 25 0 15 10 20 10 5 5 15 15 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "advert" "sales"
# Time plot of both variables
autoplot(advert, facets=TRUE)
# Fit ARIMA model
fit <- auto.arima(advert[, "sales"], xreg = advert[, "advert"], stationary = TRUE)
# Check model. Increase in sales for each unit increase in advertising
# NOTE(review): indexing the coefficient by position [3] only works for
# the ARMA order auto.arima() happened to select; selecting by name
# (e.g. coefficients(fit)["xreg"]) would be robust -- confirm position
salesincrease <- coefficients(fit)[3]
# Forecast fit as fc: advertising held at 10 units for the next 6 months
fc <- forecast(fit, xreg = rep(10, 6))
# Plot fc with x and y labels
autoplot(fc) + xlab("Month") + ylab("Sales")
# --- Dynamic regression for daily electricity demand ---
data(elecdaily, package="fpp2")
str(elecdaily)
## Time-Series [1:365, 1:3] from 1 to 53: 175 189 189 174 170 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:3] "Demand" "WorkDay" "Temperature"
# Rename the second column to the "Workday" spelling used below
elec <- elecdaily
colnames(elec)[2] <- "Workday"
# Time plots of demand and temperatures
autoplot(elec[, c("Demand", "Temperature")], facets = TRUE)
# Matrix of regressors: temperature, a squared term to allow a
# nonlinear temperature response, and the working-day indicator.
# (^ is the documented exponentiation operator; ** is a parser alias.)
xreg <- cbind(MaxTemp = elec[, "Temperature"],
              MaxTempSq = elec[, "Temperature"]^2,
              Workday = elec[, "Workday"])
# Fit model
fit <- auto.arima(elec[, "Demand"], xreg = xreg)
# Forecast fit one day ahead: max temperature 20 on a working day
forecast(fit, xreg = cbind(20, 20^2, 1))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 53.14286 185.4008 176.9271 193.8745 172.4414 198.3602
# --- Dynamic harmonic regression on weekly gasoline data ---
# The pre-loaded gasoline data comprises weekly data on US finished motor gasoline products
# In this exercise, you will fit a harmonic regression to this data set and forecast the next 3 years
data(gasoline, package="fpp2")
str(gasoline)
## Time-Series [1:1355] from 1991 to 2017: 6621 6433 6582 7224 6875 ...
# Set up harmonic regressors of order 13 (13 Fourier sin/cos pairs)
harmonics <- fourier(gasoline, K = 13)
# Fit regression model with ARIMA errors
# (seasonal = FALSE because the Fourier terms handle the seasonality)
fit <- auto.arima(gasoline, xreg = harmonics, seasonal = FALSE)
# Forecasts next 3 years (h = 156 weeks = 3 x 52)
newharmonics <- fourier(gasoline, K = 13, h = 156)
fc <- forecast(fit, xreg = newharmonics)
# Plot forecasts fc
autoplot(fc)
# --- Two seasonal periods: half-hourly electricity demand (taylor) ---
# Fit a harmonic regression using order 10 for each type of seasonality
fit <- tslm(taylor ~ fourier(taylor, K = c(10, 10)))
# Forecast 20 working days ahead (48 half-hours per day)
fc <- forecast(fit, newdata = data.frame(fourier(taylor, K = c(10, 10), h = 20 * 48)))
# Plot the forecasts
autoplot(fc)
# Check the residuals of fit
checkresiduals(fit)
##
## Breusch-Godfrey test for serial correlation of order up to 672
##
## data: Residuals from Linear regression model
## LM test = 3938.9, df = 672, p-value < 2.2e-16
# --- 5-minute call-centre volumes: daily seasonality only ---
# Another time series with multiple seasonal periods is calls, which contains 20 consecutive days of 5-minute call volume data for a large North American bank
# There are 169 5-minute periods in a working day, and so the weekly seasonal frequency is 5 x 169 = 845
# The weekly seasonality is relatively weak, so here you will just model daily seasonality. calls is pre-loaded into your workspace
# The residuals in this case still fail the white noise tests, but their autocorrelations are tiny, even though they are significant
# This is because the series is so long. It is often unrealistic to have residuals that pass the tests for such long series
# The effect of the remaining correlations on the forecasts will be negligible
data(calls, package="fpp2")
calls <- window(calls, start=29.8)
str(calls)
## Time-Series [1:3380] from 29.8 to 33.8: 98 83 89 87 71 85 76 81 86 94 ...
## - attr(*, "names")= chr [1:3380] "X26.09.20031" "X26.09.20032" "X26.09.20033" "X26.09.20034" ...
## - attr(*, "msts")= num [1:2] 169 845
# Plot the calls data
autoplot(calls)
# Set up the xreg matrix: K = c(10, 0) uses 10 Fourier pairs for the
# daily period and none for the weekly one
xreg <- fourier(calls, K = c(10, 0))
# Fit a dynamic regression model
fit <- auto.arima(calls, xreg = xreg, seasonal=FALSE, stationary=TRUE)
# Check the residuals
checkresiduals(fit)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,0,1) errors
## Q* = 1843.9, df = 1665, p-value = 0.001318
##
## Model df: 25. Total lags used: 1690
# Plot forecasts for 10 working days ahead (169 periods per day)
fc <- forecast(fit, xreg = fourier(calls, c(10, 0), h = 10 * 169))
autoplot(fc)
# --- TBATS: automatic handling of transformation, trend, seasonality ---
# The gas data contains Australian monthly gas production
# A plot of the data shows the variance has changed a lot over time, so it needs a transformation
# The seasonality has also changed shape over time, and there is a strong trend
# This makes it an ideal series to test the tbats() function which is designed to handle these features
# Plot the gas data
autoplot(gas)
# Fit a TBATS model to the gas data
fit <- tbats(gas)
# Forecast the series for the next 5 years (monthly, so h = 60)
fc <- forecast(fit, h=60)
# Plot the forecasts
autoplot(fc)
# Record the Box-Cox parameter and the order of the Fourier terms
lambda <- round(as.vector(fc$model$lambda), 3) # 0.082
K <- fc$model$k.vector #5
Chapter 1 - Introduction to Networks
What are social networks?
Network attributes - may want to add information about vertices and edges:
Network visualization principles - many options for creating and customizing the display:
Example code includes:
# Load igraph
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Read the friendship edge list: one row per pair of friends
friends <- readr::read_csv("./RInputFiles/friends.csv")
## Parsed with column specification:
## cols(
## name1 = col_character(),
## name2 = col_character()
## )
str(friends)
## Classes 'tbl_df', 'tbl' and 'data.frame': 27 obs. of 2 variables:
## $ name1: chr "Jessie" "Jessie" "Sidney" "Sidney" ...
## $ name2: chr "Sidney" "Britt" "Britt" "Donnie" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ name1: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ name2: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Inspect the first few rows of the dataframe 'friends'
head(friends)
## # A tibble: 6 x 2
## name1 name2
## <chr> <chr>
## 1 Jessie Sidney
## 2 Jessie Britt
## 3 Sidney Britt
## 4 Sidney Donnie
## 5 Karl Berry
## 6 Sidney Rene
# Convert the friends data frame to a two-column character matrix,
# the edge-list format igraph expects
friends.mat <- as.matrix(friends)
# Build an undirected igraph object from the edge list.
# graph_from_edgelist() is the current name for the deprecated
# graph.edgelist(), matching the underscore-style igraph calls
# (graph_from_data_frame, set_vertex_attr) used elsewhere in this file.
g <- graph_from_edgelist(friends.mat, directed = FALSE)
# Make a very basic plot of the network
plot(g)
# Subset vertices and edges
V(g)
## + 16/16 vertices, named, from 3390e08:
## [1] Jessie Sidney Britt Donnie Karl Berry Rene Shayne
## [9] Elisha Whitney Odell Lacy Eugene Jude Rickie Tommy
E(g)
## + 27/27 edges from 3390e08 (vertex names):
## [1] Jessie --Sidney Jessie --Britt Sidney --Britt Sidney --Donnie
## [5] Karl --Berry Sidney --Rene Britt --Rene Sidney --Shayne
## [9] Sidney --Elisha Sidney --Whitney Jessie --Whitney Donnie --Odell
## [13] Sidney --Odell Rene --Whitney Donnie --Shayne Jessie --Lacy
## [17] Rene --Lacy Elisha --Eugene Eugene --Jude Berry --Odell
## [21] Odell --Rickie Karl --Odell Britt --Lacy Elisha --Jude
## [25] Whitney--Lacy Britt --Whitney Karl --Tommy
# Count number of edges
gsize(g)
## [1] 27
# Count number of vertices
gorder(g)
## [1] 16
# Inspect the objects 'genders' and 'ages'
# (one value per vertex, in the same order as V(g) above)
genders <- c('M', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M')
ages <- c(18, 19, 21, 20, 22, 18, 23, 21, 22, 20, 20, 22, 21, 18, 19, 20)
# Create new vertex attribute called 'gender'
g <- set_vertex_attr(g, "gender", value = genders)
# Create new vertex attribute called 'age'
g <- set_vertex_attr(g, "age", value = ages)
# View all vertex attributes in a list
vertex_attr(g)
## $name
## [1] "Jessie" "Sidney" "Britt" "Donnie" "Karl" "Berry" "Rene"
## [8] "Shayne" "Elisha" "Whitney" "Odell" "Lacy" "Eugene" "Jude"
## [15] "Rickie" "Tommy"
##
## $gender
## [1] "M" "F" "F" "M" "M" "M" "F" "M" "M" "F" "M" "F" "M" "F" "M" "M"
##
## $age
## [1] 18 19 21 20 22 18 23 21 22 20 20 22 21 18 19 20
# View attributes of first five vertices in a dataframe
V(g)[[1:5]]
## + 5/16 vertices, named, from 3390e08:
## name gender age
## 1 Jessie M 18
## 2 Sidney F 19
## 3 Britt F 21
## 4 Donnie M 20
## 5 Karl M 22
# View hours (one value per edge, in E(g) order)
hours <- c(1, 2, 2, 1, 2, 5, 5, 1, 1, 3, 2, 1, 1, 5, 1, 2, 4, 1, 3, 1, 1, 1, 4, 1, 3, 3, 4)
# Create new edge attribute called 'hours'
g <- set_edge_attr(g, "hours", value = hours)
# View edge attributes of graph object
edge_attr(g)
## $hours
## [1] 1 2 2 1 2 5 5 1 1 3 2 1 1 5 1 2 4 1 3 1 1 1 4 1 3 3 4
# Find all edges that include "Britt"
E(g)[[inc('Britt')]]
## + 5/27 edges from 3390e08 (vertex names):
## tail head tid hid hours
## 2 Jessie Britt 1 3 2
## 3 Sidney Britt 2 3 2
## 7 Britt Rene 3 7 5
## 23 Britt Lacy 3 12 4
## 26 Britt Whitney 3 10 3
# Find all pairs that spend 4 or more hours together per week
E(g)[[hours>=4]]
## + 6/27 edges from 3390e08 (vertex names):
## tail head tid hid hours
## 6 Sidney Rene 2 7 5
## 7 Britt Rene 3 7 5
## 14 Rene Whitney 7 10 5
## 17 Rene Lacy 7 12 4
## 23 Britt Lacy 3 12 4
## 27 Karl Tommy 5 16 4
# A second network: vertex table (with gender) and weighted edge table
friends1_nodes <- readr::read_csv("./RInputFiles/friends1_nodes.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## gender = col_character()
## )
str(friends1_nodes)
## Classes 'tbl_df', 'tbl' and 'data.frame': 19 obs. of 2 variables:
## $ name : chr "Joe" "Erin" "Kelley" "Ronald" ...
## $ gender: chr "M" "F" "F" "M" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ name : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ gender: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
friends1_edges <- readr::read_csv("./RInputFiles/friends1_edges.csv")
## Parsed with column specification:
## cols(
## name1 = col_character(),
## name2 = col_character(),
## hours = col_integer()
## )
str(friends1_edges)
## Classes 'tbl_df', 'tbl' and 'data.frame': 25 obs. of 3 variables:
## $ name1: chr "Joe" "Joe" "Joe" "Erin" ...
## $ name2: chr "Ronald" "Michael" "Troy" "Kelley" ...
## $ hours: int 1 3 2 3 5 1 3 5 2 1 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 3
## .. ..$ name1: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ name2: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ hours: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Create an igraph object with attributes directly from dataframes
g1 <- graph_from_data_frame(d = friends1_edges, vertices = friends1_nodes, directed = FALSE)
# Subset edges greater than or equal to 5 hours
E(g1)[[hours >= 5]]
## + 4/25 edges from 33ab6dc (vertex names):
## tail head tid hid hours
## 5 Kelley Valentine 3 6 5
## 8 Ronald Jasmine 4 8 5
## 12 Valentine Perry 6 15 5
## 15 Jasmine Juan 8 9 6
# Plot network and color vertices by gender
V(g1)$color <- ifelse(V(g1)$gender == "F", "orange", "dodgerblue")
plot(g1, vertex.label.color = "black")
# Plot the graph object g1 in a circle layout
plot(g1, vertex.label.color = "black", layout = layout_in_circle(g1))
# Plot the graph object g1 in a Fruchterman-Reingold layout
plot(g1, vertex.label.color = "black", layout = layout_with_fr(g1))
# Plot the graph object g1 in a Tree layout
m <- layout_as_tree(g1)
plot(g1, vertex.label.color = "black", layout = m)
# Plot the graph object g1 using igraph's chosen layout
m1 <- layout_nicely(g1)
plot(g1, vertex.label.color = "black", layout = m1)
# Create a vector of weights based on the number of hours each pair spend together
w1 <- E(g1)$hours
# Plot the network varying edges by weights
m1 <- layout_nicely(g1)
plot(g1,
vertex.label.color = "black",
edge.color = 'black',
edge.width = w1,
layout = m1)
# Create a new igraph object only including edges from the original graph that are greater than 2 hours long
# NOTE(review): deleting edges with hours < 2 keeps edges of exactly 2
# hours, i.e. "at least 2", not strictly "greater than 2" as the comment
# above says -- confirm intent
g2 <- delete_edges(g1, E(g1)[hours < 2])
# Plot the new graph
w2 <- E(g2)$hours
m2 <- layout_nicely(g2)
plot(g2,
vertex.label.color = "black",
edge.color = 'black',
edge.width = w2,
layout = m2)
Chapter 2 - Identifying Important Vertices in a Network
Directed Networks - arrows represent the from-to relationship, such as e-mail exchanges:
Relationship between Vertices - overall patterns between networks (neighbors and paths):
Significant nodes in a network:
Example code includes:
# Read the measles transmission edge list (integer vertex ids)
measles <- readr::read_csv("./RInputFiles/measles.csv")
## Parsed with column specification:
## cols(
## from = col_integer(),
## to = col_integer()
## )
str(measles)
## Classes 'tbl_df', 'tbl' and 'data.frame': 184 obs. of 2 variables:
## $ from: int 45 45 172 180 45 180 42 45 182 45 ...
## $ to : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ from: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ to : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Build a directed graph: each row of measles becomes a from -> to edge
g <- graph_from_data_frame(measles, directed = TRUE)
# Is the graph directed?
# (is_directed()/is_weighted() are the current igraph names for the
# deprecated is.directed()/is.weighted(), consistent with
# graph_from_data_frame() above)
is_directed(g)
## [1] TRUE
# Is the graph weighted?
is_weighted(g)
## [1] FALSE
# Where does each edge originate from?
table(head_of(g, E(g)))
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18 19
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 56 57 58 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 184 185 186 187
## 1 1 1 1
# Make a basic plot
plot(g,
vertex.label.color = "black",
edge.color = 'gray77',
vertex.size = 0,
edge.arrow.size = 0.1,
layout = layout_nicely(g))
# Bracket-indexing a graph by two vertex names queries the adjacency
# matrix: 1 if an edge runs from the first vertex to the second, else 0.
# Is there an edge going from vertex 184 to vertex 178?
g['184', '178']
## [1] 1
# Is there an edge going from vertex 178 to vertex 184?
# (0 here shows the relation is not symmetric — the graph is directed.)
g['178', '184']
## [1] 0
# incident() lists the edges attached to a vertex; mode selects direction.
# Show all edges going to or from vertex 184
incident(g, '184', mode = c("all"))
## + 6/184 edges from 344035d (vertex names):
## [1] 184->45 184->182 184->181 184->178 184->183 184->177
# Show all edges going out from vertex 184
# (same 6 edges as "all" above, so vertex 184 has no incoming edges)
incident(g, '184', mode = c("out"))
## + 6/184 edges from 344035d (vertex names):
## [1] 184->45 184->182 184->181 184->178 184->183 184->177
# neighbors() returns adjacent vertices; mode = 'all'/'in'/'out' controls
# whether edge direction is ignored, restricted to incoming, or to outgoing.
# Identify all neighbors of vertex 12 regardless of direction
neighbors(g, '12', mode = c('all'))
## + 5/187 vertices, named, from 344035d:
## [1] 45 13 72 89 109
# Identify other vertices that direct edges towards vertex 12
neighbors(g, '12', mode = c('in'))
## + 1/187 vertex, named, from 344035d:
## [1] 45
# Identify any vertices that receive an edge from vertex 42 and direct an edge to vertex 124
# (set intersection of 42's out-neighbors and 124's in-neighbors)
n1 <- neighbors(g, '42', mode = c('out'))
n2 <- neighbors(g, '124', mode = c('in'))
intersection(n1, n2)
## + 1/187 vertex, named, from 344035d:
## [1] 7
# Which two vertices are the furthest apart in the graph ?
# farthest_vertices() returns the endpoint pair of the longest geodesic
# (the diameter) plus its length in edges.
farthest_vertices(g)
## $vertices
## + 2/187 vertices, named, from 344035d:
## [1] 184 162
##
## $distance
## [1] 5
# Shows the path sequence between two furthest apart vertices.
# get_diameter() returns the actual vertex sequence along that longest path.
get_diameter(g)
## + 6/187 vertices, named, from 344035d:
## [1] 184 178 42 7 123 162
# ego() returns the order-2 neighborhood of a vertex: every vertex reachable
# in at most 2 steps (the focal vertex itself is included, at distance 0).
# Identify vertices that are reachable within two connections from vertex 42
ego(g, 2, '42', mode = c('out'))
## [[1]]
## + 13/187 vertices, named, from 344035d:
## [1] 42 7 106 43 123 101 120 124 125 128 129 108 127
# Identify vertices that can reach vertex 42 within two connections
# (mode = 'in' follows edges backwards)
ego(g, 2, '42', mode = c('in'))
## [[1]]
## + 3/187 vertices, named, from 344035d:
## [1] 42 178 184
# Calculate the out-degree of each vertex
# (number of outgoing edges; here, how many others each case infected)
g.outd <- degree(g, mode = c("out"))
# View a summary of out-degree
# (125 of 187 vertices infected nobody; one vertex infected 30 others)
table(g.outd)
## g.outd
## 0 1 2 3 4 6 7 8 30
## 125 21 16 12 6 2 3 1 1
# Make a histogram of out-degrees
hist(g.outd, breaks = 30)
# Find the vertex that has the maximum out-degree
# which.max() returns the position of the first maximum; the printed name
# ("45") is the vertex name, the value (1) is its index in g.outd.
which.max(g.outd)
## 45
## 1
# Calculate betweenness of each vertex
# (how often a vertex lies on shortest paths between other vertex pairs;
# directed = TRUE respects edge direction when counting paths)
g.b <- betweenness(g, directed = TRUE)
# Show histogram of vertex betweenness
hist(g.b, breaks = 80)
# Create plot with vertex size determined by betweenness score
# sqrt() compresses the heavy right tail; +1 keeps zero-betweenness
# vertices visible.
plot(g,
vertex.label = NA,
edge.color = 'black',
vertex.size = sqrt(g.b)+1,
edge.arrow.size = 0.05,
layout = layout_nicely(g))
# Make an ego graph
# make_ego_graph() with order = diameter(g) around vertex 184 captures every
# vertex reachable from/to it regardless of direction (mode = "all");
# [[1]] unwraps the single-element list it returns.
g184 <- make_ego_graph(g, diameter(g), nodes = '184', mode = c("all"))[[1]]
# Get a vector of geodesic distances of all vertices from vertex 184
dists <- distances(g184, "184")
# Create a color palette of length equal to the maximal geodesic distance plus one.
# Six colors cover distances 0 through 5 (the graph's diameter is 5, per
# the farthest_vertices() result above), indexed via dists + 1 because
# R vectors are 1-based.
colors <- c("black", "red", "orange", "blue", "dodgerblue", "cyan")
# Set color attribute to vertices of network g184.
V(g184)$color <- colors[dists+1]
# Visualize the network based on geodesic distance from vertex 184 (patient zero).
# Each vertex is labeled with its distance and colored by the palette above.
plot(g184,
vertex.label = dists,
vertex.label.color = "white",
vertex.label.cex = .6,
edge.color = 'black',
vertex.size = 7,
edge.arrow.size = .05,
main = "Geodesic Distances from Patient Zero"
)
Chapter 3 - Characterizing Network Structures